1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
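//
// As an illustrative sketch (not taken from any particular test case), with a
// vectorization factor of 4 a scalar loop such as
//
//   for (i = 0; i < n; i += 1)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten to
//
//   for (i = 0; i < n - n % 4; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // one wide SIMD operation per group of 4
//
// with the remaining n % 4 iterations handled by a scalar remainder loop.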
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/PatternMatch.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
200     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks with a "
202              "vectorize(enable) pragma."));
203 
204 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
205 // and that predication is preferred. I.e., the vectorizer will try to fold the
206 // tail loop (epilogue) into the vector body and predicate the instructions
207 // accordingly. If tail-folding fails, the fallback strategy depends on which of
208 // the following values is selected:
209 namespace PreferPredicateTy {
210   enum Option {
211     ScalarEpilogue = 0,
212     PredicateElseScalarEpilogue,
213     PredicateOrDontVectorize
214   };
215 } // namespace PreferPredicateTy
216 
217 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
218     "prefer-predicate-over-epilogue",
219     cl::init(PreferPredicateTy::ScalarEpilogue),
220     cl::Hidden,
221     cl::desc("Tail-folding and predication preferences over creating a scalar "
222              "epilogue loop."),
223     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
224                          "scalar-epilogue",
225                          "Don't tail-predicate loops, create scalar epilogue"),
226               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
227                          "predicate-else-scalar-epilogue",
228                          "prefer tail-folding, create scalar epilogue if tail "
229                          "folding fails."),
230               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
231                          "predicate-dont-vectorize",
232                          "prefer tail-folding, don't attempt vectorization if "
233                          "tail-folding fails.")));
234 
235 static cl::opt<bool> MaximizeBandwidth(
236     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
237     cl::desc("Maximize bandwidth when selecting the vectorization factor, which "
238              "will be determined by the smallest type in the loop."));
239 
240 static cl::opt<bool> EnableInterleavedMemAccesses(
241     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
242     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
243 
244 /// An interleave-group may need masking if it resides in a block that needs
245 /// predication, or in order to mask away gaps.
246 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
247     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
248     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
249 
250 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
251     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
252     cl::desc("We don't interleave loops with an estimated constant trip count "
253              "below this number"));
254 
255 static cl::opt<unsigned> ForceTargetNumScalarRegs(
256     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of scalar registers."));
258 
259 static cl::opt<unsigned> ForceTargetNumVectorRegs(
260     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's number of vector registers."));
262 
263 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
264     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
265     cl::desc("A flag that overrides the target's max interleave factor for "
266              "scalar loops."));
267 
268 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
269     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
270     cl::desc("A flag that overrides the target's max interleave factor for "
271              "vectorized loops."));
272 
273 static cl::opt<unsigned> ForceTargetInstructionCost(
274     "force-target-instruction-cost", cl::init(0), cl::Hidden,
275     cl::desc("A flag that overrides the target's expected cost for "
276              "an instruction to a single constant value. Mostly "
277              "useful for getting consistent testing."));
278 
279 static cl::opt<bool> ForceTargetSupportsScalableVectors(
280     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
281     cl::desc(
282         "Pretend that scalable vectors are supported, even if the target does "
283         "not support them. This flag should only be used for testing."));
284 
285 static cl::opt<unsigned> SmallLoopCost(
286     "small-loop-cost", cl::init(20), cl::Hidden,
287     cl::desc(
288         "The cost of a loop that is considered 'small' by the interleaver."));
289 
290 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
291     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
292     cl::desc("Enable the use of the block frequency analysis to access PGO "
293              "heuristics, minimizing code growth in cold regions and being more "
294              "aggressive in hot regions."));
295 
296 // Runtime interleave loops for load/store throughput.
297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
298     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
299     cl::desc(
300         "Enable runtime interleaving until load/store ports are saturated"));
301 
302 /// Interleave small loops with scalar reductions.
303 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
304     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
305     cl::desc("Enable interleaving for loops with small iteration counts that "
306              "contain scalar reductions to expose ILP."));
307 
308 /// The number of stores in a loop that are allowed to need predication.
309 static cl::opt<unsigned> NumberOfStoresToPredicate(
310     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
311     cl::desc("Max number of stores to be predicated behind an if."));
312 
313 static cl::opt<bool> EnableIndVarRegisterHeur(
314     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
315     cl::desc("Count the induction variable only once when interleaving"));
316 
317 static cl::opt<bool> EnableCondStoresVectorization(
318     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
319     cl::desc("Enable if-predication of stores during vectorization."));
320 
321 static cl::opt<unsigned> MaxNestedScalarReductionIC(
322     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
323     cl::desc("The maximum interleave count to use when interleaving a scalar "
324              "reduction in a nested loop."));
325 
326 static cl::opt<bool>
327     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
328                            cl::Hidden,
329                            cl::desc("Prefer in-loop vector reductions, "
330                                     "overriding the target's preference."));
331 
332 static cl::opt<bool> ForceOrderedReductions(
333     "force-ordered-reductions", cl::init(false), cl::Hidden,
334     cl::desc("Enable the vectorization of loops with in-order (strict) "
335              "FP reductions"));
336 
337 static cl::opt<bool> PreferPredicatedReductionSelect(
338     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
339     cl::desc(
340         "Prefer predicating a reduction operation over an after-loop select."));
341 
342 cl::opt<bool> EnableVPlanNativePath(
343     "enable-vplan-native-path", cl::init(false), cl::Hidden,
344     cl::desc("Enable VPlan-native vectorization path with "
345              "support for outer loop vectorization."));
346 
347 // FIXME: Remove this switch once we have divergence analysis. Currently we
348 // assume divergent non-backedge branches when this switch is true.
349 cl::opt<bool> EnableVPlanPredication(
350     "enable-vplan-predication", cl::init(false), cl::Hidden,
351     cl::desc("Enable VPlan-native vectorization path predicator with "
352              "support for outer loop vectorization."));
353 
354 // This flag enables the stress testing of the VPlan H-CFG construction in the
355 // VPlan-native vectorization path. It must be used in conjunction with
356 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
357 // verification of the H-CFGs built.
358 static cl::opt<bool> VPlanBuildStressTest(
359     "vplan-build-stress-test", cl::init(false), cl::Hidden,
360     cl::desc(
361         "Build VPlan for every supported loop nest in the function and bail "
362         "out right after the build (stress test the VPlan H-CFG construction "
363         "in the VPlan-native vectorization path)."));
364 
365 cl::opt<bool> llvm::EnableLoopInterleaving(
366     "interleave-loops", cl::init(true), cl::Hidden,
367     cl::desc("Enable loop interleaving in loop vectorization passes"));
368 cl::opt<bool> llvm::EnableLoopVectorization(
369     "vectorize-loops", cl::init(true), cl::Hidden,
370     cl::desc("Run the loop vectorization passes"));
371 
372 cl::opt<bool> PrintVPlansInDotFormat(
373     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
374     cl::desc("Use dot format instead of plain text when dumping VPlans"));
375 
376 /// A helper function that returns true if the given type is irregular. The
377 /// type is irregular if its allocated size doesn't equal the store size of an
378 /// element of the corresponding vector type.
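/// For example (illustrative, assuming a typical x86-64 data layout):
///   DL.getTypeSizeInBits(x86_fp80)      == 80
///   DL.getTypeAllocSizeInBits(x86_fp80) == 128  -> irregular
///   DL.getTypeSizeInBits(i32)           == 32
///   DL.getTypeAllocSizeInBits(i32)      == 32   -> regular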
379 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
380   // Determine if an array of N elements of type Ty is "bitcast compatible"
381   // with a <N x Ty> vector.
382   // This is only true if there is no padding between the array elements.
383   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
384 }
385 
386 /// A helper function that returns the reciprocal of the block probability of
387 /// predicated blocks. If we return X, we are assuming the predicated block
388 /// will execute once for every X iterations of the loop header.
389 ///
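/// For example (illustrative): with the current return value of 2, the cost
/// model treats an instruction in a predicated block as executing on roughly
/// every other iteration of the loop header, i.e. its per-iteration cost is
/// halved.
///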
390 /// TODO: We should use actual block probability here, if available. Currently,
391 ///       we always assume predicated blocks have a 50% chance of executing.
392 static unsigned getReciprocalPredBlockProb() { return 2; }
393 
394 /// A helper function that returns an integer or floating-point constant with
395 /// value C.
396 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
397   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
398                            : ConstantFP::get(Ty, C);
399 }
400 
401 /// Returns "best known" trip count for the specified loop \p L as defined by
402 /// the following procedure:
403 ///   1) Returns exact trip count if it is known.
404 ///   2) Returns expected trip count according to profile data if any.
405 ///   3) Returns upper bound estimate if it is known.
406 ///   4) Returns None if all of the above failed.
407 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
408   // Check if exact trip count is known.
409   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
410     return ExpectedTC;
411 
412   // Check if there is an expected trip count available from profile data.
413   if (LoopVectorizeWithBlockFrequency)
414     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
415       return EstimatedTC;
416 
417   // Check if upper bound estimate is known.
418   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
419     return ExpectedTC;
420 
421   return None;
422 }
423 
424 // Forward declare GeneratedRTChecks.
425 class GeneratedRTChecks;
426 
427 namespace llvm {
428 
429 AnalysisKey ShouldRunExtraVectorPasses::Key;
430 
431 /// InnerLoopVectorizer vectorizes loops which contain only one basic
432 /// block to a specified vectorization factor (VF).
433 /// This class performs the widening of scalars into vectors, or multiple
434 /// scalars. This class also implements the following features:
435 /// * It inserts an epilogue loop for handling loops that don't have iteration
436 ///   counts that are known to be a multiple of the vectorization factor.
437 /// * It handles the code generation for reduction variables.
438 /// * Scalarization (implementation using scalars) of un-vectorizable
439 ///   instructions.
440 /// InnerLoopVectorizer does not perform any vectorization-legality
441 /// checks, and relies on the caller to check for the different legality
442 /// aspects. The InnerLoopVectorizer relies on the
443 /// LoopVectorizationLegality class to provide information about the induction
444 /// and reduction variables that were found for a given vectorization factor.
445 class InnerLoopVectorizer {
446 public:
447   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448                       LoopInfo *LI, DominatorTree *DT,
449                       const TargetLibraryInfo *TLI,
450                       const TargetTransformInfo *TTI, AssumptionCache *AC,
451                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
453                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
454                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
455       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
456         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
457         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
458         PSI(PSI), RTChecks(RTChecks) {
459     // Query this against the original loop and save it here because the profile
460     // of the original loop header may change as the transformation happens.
461     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
462         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
463   }
464 
465   virtual ~InnerLoopVectorizer() = default;
466 
467   /// Create a new empty loop that will contain vectorized instructions later
468   /// on, while the old loop will be used as the scalar remainder. Control flow
469   /// is generated around the vectorized (and scalar epilogue) loops consisting
470   /// of various checks and bypasses. Return the pre-header block of the new
471   /// loop and the start value for the canonical induction, if it is != 0. The
472   /// latter is the case when vectorizing the epilogue loop. In the case of
473 /// epilogue vectorization, this function is overridden to handle the more
474   /// complex control flow around the loops.
475   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
476 
477   /// Widen a single call instruction within the innermost loop.
478   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
479                             VPTransformState &State);
480 
481   /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
482   void fixVectorizedLoop(VPTransformState &State);
483 
484   // Return true if any runtime check is added.
485   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
486 
487   /// A type for vectorized values in the new loop. Each value from the
488   /// original loop, when vectorized, is represented by UF vector values in the
489   /// new unrolled loop, where UF is the unroll factor.
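  /// For example (illustrative), with UF = 2 and VF = 4 a single original
  /// value is held as two <4 x Ty> vector values, VectorParts[0] and
  /// VectorParts[1], one per unrolled part.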
490   using VectorParts = SmallVector<Value *, 2>;
491 
492   /// Vectorize a single first-order recurrence or pointer induction PHINode in
493   /// a block. This method handles the induction variable canonicalization. It
494   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
495   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
496                            VPTransformState &State);
497 
498   /// A helper function to scalarize a single Instruction in the innermost loop.
499   /// Generates a sequence of scalar instances for each lane between \p MinLane
500   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
501   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
502   /// Instr's operands.
503   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
504                             const VPIteration &Instance, bool IfPredicateInstr,
505                             VPTransformState &State);
506 
507   /// Construct the vector value of a scalarized value \p V one lane at a time.
508   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
509                                  VPTransformState &State);
510 
511   /// Try to vectorize interleaved access group \p Group with the base address
512   /// given in \p Addr, optionally masking the vector operations if \p
513   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
514   /// values in the vectorized loop.
515   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
516                                 ArrayRef<VPValue *> VPDefs,
517                                 VPTransformState &State, VPValue *Addr,
518                                 ArrayRef<VPValue *> StoredValues,
519                                 VPValue *BlockInMask = nullptr);
520 
521   /// Set the debug location in the builder \p CustomBuilder using the debug
522   /// location in \p V. If \p CustomBuilder is None, the class member's Builder is used.
523   void setDebugLocFromInst(const Value *V,
524                            Optional<IRBuilderBase *> CustomBuilder = None);
525 
526   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
527   void fixNonInductionPHIs(VPTransformState &State);
528 
529   /// Returns true if the reordering of FP operations is not allowed, but we are
530   /// able to vectorize with strict in-order reductions for the given RdxDesc.
531   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
532 
533   /// Create a broadcast instruction. This method generates a broadcast
534   /// instruction (shuffle) for loop invariant values and for the induction
535   /// value. If this is the induction variable then we extend it to N, N+1, ...;
536   /// this is needed because each iteration in the loop corresponds to a SIMD
537   /// element.
538   virtual Value *getBroadcastInstrs(Value *V);
539 
540   /// Add metadata from one instruction to another.
541   ///
542   /// This includes both the original MDs from \p From and additional ones (\see
543   /// addNewMetadata).  Use this for *newly created* instructions in the vector
544   /// loop.
545   void addMetadata(Instruction *To, Instruction *From);
546 
547   /// Similar to the previous function but it adds the metadata to a
548   /// vector of instructions.
549   void addMetadata(ArrayRef<Value *> To, Instruction *From);
550 
551   // Returns the resume value (bc.merge.rdx) for a reduction as
552   // generated by fixReduction.
553   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
554 
555 protected:
556   friend class LoopVectorizationPlanner;
557 
558   /// A small list of PHINodes.
559   using PhiVector = SmallVector<PHINode *, 4>;
560 
561   /// A type for scalarized values in the new loop. Each value from the
562   /// original loop, when scalarized, is represented by UF x VF scalar values
563   /// in the new unrolled loop, where UF is the unroll factor and VF is the
564   /// vectorization factor.
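  /// For example (illustrative), with UF = 2 and VF = 4 a scalarized value is
  /// held as 2 x 4 = 8 scalar Values, addressed as ScalarParts[Part][Lane].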
565   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
566 
567   /// Set up the values of the IVs correctly when exiting the vector loop.
568   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
569                     Value *CountRoundDown, Value *EndValue,
570                     BasicBlock *MiddleBlock);
571 
572   /// Introduce a conditional branch (on true, condition to be set later) at the
573   /// end of the header (which is also the latch), connecting it to itself
574   /// (across the backedge) and to the exit block of \p L.
575   void createHeaderBranch(Loop *L);
576 
577   /// Handle all cross-iteration phis in the header.
578   void fixCrossIterationPHIs(VPTransformState &State);
579 
580   /// Create the exit value of first order recurrences in the middle block and
581   /// update their users.
582   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
583                                VPTransformState &State);
584 
585   /// Create code for the loop exit value of the reduction.
586   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
587 
588   /// Clear NSW/NUW flags from reduction instructions if necessary.
589   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
590                                VPTransformState &State);
591 
592   /// Fix up the LCSSA phi nodes in the unique exit block.  This simply
593   /// means we need to add the appropriate incoming value from the middle
594   /// block as exiting edges from the scalar epilogue loop (if present) are
595   /// already in place, and we exit the vector loop exclusively to the middle
596   /// block.
597   void fixLCSSAPHIs(VPTransformState &State);
598 
599   /// Iteratively sink the scalarized operands of a predicated instruction into
600   /// the block that was created for it.
601   void sinkScalarOperands(Instruction *PredInst);
602 
603   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
604   /// represented as.
605   void truncateToMinimalBitwidths(VPTransformState &State);
606 
607   /// Returns (and creates if needed) the original loop trip count.
608   Value *getOrCreateTripCount(Loop *NewLoop);
609 
610   /// Returns (and creates if needed) the trip count of the widened loop.
611   Value *getOrCreateVectorTripCount(Loop *NewLoop);
612 
613   /// Returns a bitcasted value to the requested vector type.
614   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
615   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
616                                 const DataLayout &DL);
617 
618   /// Emit a bypass check to see if the vector trip count is zero, including if
619   /// it overflows.
620   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
621 
622   /// Emit a bypass check to see if all of the SCEV assumptions we've
623   /// had to make are correct. Returns the block containing the checks or
624   /// nullptr if no checks have been added.
625   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
626 
627   /// Emit bypass checks to check any memory assumptions we may have made.
628   /// Returns the block containing the checks or nullptr if no checks have been
629   /// added.
630   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
631 
632   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
633   /// vector loop preheader, middle block and scalar preheader. Also
634   /// allocate a loop object for the new vector loop and return it.
635   Loop *createVectorLoopSkeleton(StringRef Prefix);
636 
637   /// Create new phi nodes for the induction variables to resume the iteration
638   /// count in the scalar epilogue, from where the vectorized loop left off.
639   /// In cases where the loop skeleton is more complicated (e.g. epilogue
640   /// vectorization) and the resume values can come from an additional bypass
641   /// block, the \p AdditionalBypass pair provides information about the bypass
642   /// block and the end value on the edge from bypass to this loop.
643   void createInductionResumeValues(
644       Loop *L,
645       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
646 
647   /// Complete the loop skeleton by adding debug MDs, creating appropriate
648   /// conditional branches in the middle block, preparing the builder and
649   /// running the verifier. Take in the vector loop \p L as argument, and return
650   /// the preheader of the completed vector loop.
651   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
652 
653   /// Add additional metadata to \p To that was not present on \p Orig.
654   ///
655   /// Currently this is used to add the noalias annotations based on the
656   /// inserted memchecks.  Use this for instructions that are *cloned* into the
657   /// vector loop.
658   void addNewMetadata(Instruction *To, const Instruction *Orig);
659 
660   /// Collect poison-generating recipes that may generate a poison value that is
661   /// used after vectorization, even when their operands are not poison. Those
662   /// recipes meet the following conditions:
663   ///  * Contribute to the address computation of a recipe generating a widen
664   ///    memory load/store (VPWidenMemoryInstructionRecipe or
665   ///    VPInterleaveRecipe).
666   ///  * Such a widen memory load/store has at least one underlying Instruction
667   ///    that is in a basic block that needs predication and after vectorization
668   ///    the generated instruction won't be predicated.
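  /// For example (illustrative only): an 'inbounds' GEP feeding the address of
  /// a load that originally executed only under a condition may produce poison
  /// for the masked-off lanes once the load is widened and no longer
  /// predicated; such recipes are collected so their poison-generating flags
  /// can be dropped.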
669   void collectPoisonGeneratingRecipes(VPTransformState &State);
670 
671   /// Allow subclasses to override and print debug traces before/after vplan
672   /// execution, when trace information is requested.
673   virtual void printDebugTracesAtStart() {}
674   virtual void printDebugTracesAtEnd() {}
675 
676   /// The original loop.
677   Loop *OrigLoop;
678 
679   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
680   /// dynamic knowledge to simplify SCEV expressions and converts them to a
681   /// more usable form.
682   PredicatedScalarEvolution &PSE;
683 
684   /// Loop Info.
685   LoopInfo *LI;
686 
687   /// Dominator Tree.
688   DominatorTree *DT;
689 
690   /// Alias Analysis.
691   AAResults *AA;
692 
693   /// Target Library Info.
694   const TargetLibraryInfo *TLI;
695 
696   /// Target Transform Info.
697   const TargetTransformInfo *TTI;
698 
699   /// Assumption Cache.
700   AssumptionCache *AC;
701 
702   /// Interface to emit optimization remarks.
703   OptimizationRemarkEmitter *ORE;
704 
705   /// LoopVersioning.  It's only set up (non-null) if memchecks were
706   /// used.
707   ///
708   /// This is currently only used to add no-alias metadata based on the
709   /// memchecks.  The actual versioning is performed manually.
710   std::unique_ptr<LoopVersioning> LVer;
711 
712   /// The vectorization SIMD factor to use. Each vector will have this many
713   /// vector elements.
714   ElementCount VF;
715 
716   /// The vectorization unroll factor to use. Each scalar is vectorized to this
717   /// many different vector instructions.
718   unsigned UF;
719 
720   /// The builder that we use.
721   IRBuilder<> Builder;
722 
723   // --- Vectorization state ---
724 
725   /// The vector-loop preheader.
726   BasicBlock *LoopVectorPreHeader;
727 
728   /// The scalar-loop preheader.
729   BasicBlock *LoopScalarPreHeader;
730 
731   /// Middle Block between the vector and the scalar.
732   BasicBlock *LoopMiddleBlock;
733 
734   /// The unique ExitBlock of the scalar loop if one exists.  Note that
735   /// there can be multiple exiting edges reaching this block.
736   BasicBlock *LoopExitBlock;
737 
738   /// The vector loop body.
739   BasicBlock *LoopVectorBody;
740 
741   /// The scalar loop body.
742   BasicBlock *LoopScalarBody;
743 
744   /// A list of all bypass blocks. The first block is the entry of the loop.
745   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
746 
747   /// Store instructions that were predicated.
748   SmallVector<Instruction *, 4> PredicatedInstructions;
749 
750   /// Trip count of the original loop.
751   Value *TripCount = nullptr;
752 
753   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
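  /// For example (illustrative): with TripCount = 23, VF = 4 and UF = 2 this
  /// is 23 - 23 % 8 = 16, i.e. two executions of the unrolled vector body,
  /// leaving 7 iterations for the scalar remainder loop.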
754   Value *VectorTripCount = nullptr;
755 
756   /// The legality analysis.
757   LoopVectorizationLegality *Legal;
758 
759   /// The profitability analysis.
760   LoopVectorizationCostModel *Cost;
761 
762   // Record whether runtime checks are added.
763   bool AddedSafetyChecks = false;
764 
765   // Holds the end values for each induction variable. We save the end values
766   // so we can later fix-up the external users of the induction variables.
767   DenseMap<PHINode *, Value *> IVEndValues;
768 
769   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
770   // fixed up at the end of vector code generation.
771   SmallVector<PHINode *, 8> OrigPHIsToFix;
772 
773   /// BFI and PSI are used to check for profile-guided size optimizations.
774   BlockFrequencyInfo *BFI;
775   ProfileSummaryInfo *PSI;
776 
777   // Whether this loop should be optimized for size based on profile-guided
778   // size optimizations.
779   bool OptForSizeBasedOnProfile;
780 
781   /// Structure to hold information about generated runtime checks, responsible
782   /// for cleaning up the checks if vectorization turns out to be unprofitable.
783   GeneratedRTChecks &RTChecks;
784 
785   // Holds the resume values for reductions in the loops, used to set the
786   // correct start value of reduction PHIs when vectorizing the epilogue.
787   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
788       ReductionResumeValues;
789 };
790 
791 class InnerLoopUnroller : public InnerLoopVectorizer {
792 public:
793   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
794                     LoopInfo *LI, DominatorTree *DT,
795                     const TargetLibraryInfo *TLI,
796                     const TargetTransformInfo *TTI, AssumptionCache *AC,
797                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
798                     LoopVectorizationLegality *LVL,
799                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
800                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
801       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
802                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
803                             BFI, PSI, Check) {}
804 
805 private:
806   Value *getBroadcastInstrs(Value *V) override;
807 };
808 
809 /// Encapsulate information regarding vectorization of a loop and its epilogue.
810 /// This information is meant to be updated and used across two stages of
811 /// epilogue vectorization.
812 struct EpilogueLoopVectorizationInfo {
813   ElementCount MainLoopVF = ElementCount::getFixed(0);
814   unsigned MainLoopUF = 0;
815   ElementCount EpilogueVF = ElementCount::getFixed(0);
816   unsigned EpilogueUF = 0;
817   BasicBlock *MainLoopIterationCountCheck = nullptr;
818   BasicBlock *EpilogueIterationCountCheck = nullptr;
819   BasicBlock *SCEVSafetyCheck = nullptr;
820   BasicBlock *MemSafetyCheck = nullptr;
821   Value *TripCount = nullptr;
822   Value *VectorTripCount = nullptr;
823 
824   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
825                                 ElementCount EVF, unsigned EUF)
826       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
827     assert(EUF == 1 &&
828            "A high UF for the epilogue loop is likely not beneficial.");
829   }
830 };
831 
832 /// An extension of the inner loop vectorizer that creates a skeleton for a
833 /// vectorized loop that has its epilogue (residual) also vectorized.
834 /// The idea is to run the VPlan on a given loop twice: first to set up the
835 /// skeleton and vectorize the main loop, and second to complete the skeleton
836 /// from the first step and vectorize the epilogue.  This is achieved by
837 /// deriving two concrete strategy classes from this base class and invoking
838 /// them in succession from the loop vectorizer planner.
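/// The resulting control flow is roughly (an illustrative sketch, not the
/// exact CFG that is produced):
///
///   main iteration-count check
///     -> main vector loop (MainLoopVF x MainLoopUF)
///     -> epilogue iteration-count check
///     -> epilogue vector loop (EpilogueVF x EpilogueUF)
///     -> scalar remainder loop
///
/// with bypass edges so that trip counts too small for a given stage skip
/// directly to a later one.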
839 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
840 public:
841   InnerLoopAndEpilogueVectorizer(
842       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
843       DominatorTree *DT, const TargetLibraryInfo *TLI,
844       const TargetTransformInfo *TTI, AssumptionCache *AC,
845       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
846       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
847       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
848       GeneratedRTChecks &Checks)
849       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
850                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
851                             Checks),
852         EPI(EPI) {}
853 
854   // Override this function to handle the more complex control flow around the
855   // three loops.
856   std::pair<BasicBlock *, Value *>
857   createVectorizedLoopSkeleton() final override {
858     return createEpilogueVectorizedLoopSkeleton();
859   }
860 
861   /// The interface for creating a vectorized skeleton using one of two
862   /// different strategies, each corresponding to one execution of the vplan
863   /// as described above.
864   virtual std::pair<BasicBlock *, Value *>
865   createEpilogueVectorizedLoopSkeleton() = 0;
866 
867   /// Holds and updates state information required to vectorize the main loop
868   /// and its epilogue in two separate passes. This setup helps us avoid
869   /// regenerating and recomputing runtime safety checks. It also helps us to
870   /// shorten the iteration-count-check path length for the cases where the
871   /// iteration count of the loop is so small that the main vector loop is
872   /// completely skipped.
873   EpilogueLoopVectorizationInfo &EPI;
874 };
875 
876 /// A specialized derived class of inner loop vectorizer that performs
877 /// vectorization of *main* loops in the process of vectorizing loops and their
878 /// epilogues.
879 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
880 public:
881   EpilogueVectorizerMainLoop(
882       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
883       DominatorTree *DT, const TargetLibraryInfo *TLI,
884       const TargetTransformInfo *TTI, AssumptionCache *AC,
885       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
886       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
887       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
888       GeneratedRTChecks &Check)
889       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
890                                        EPI, LVL, CM, BFI, PSI, Check) {}
891   /// Implements the interface for creating a vectorized skeleton using the
892   /// *main loop* strategy (i.e. the first pass of VPlan execution).
893   std::pair<BasicBlock *, Value *>
894   createEpilogueVectorizedLoopSkeleton() final override;
895 
896 protected:
897   /// Emits an iteration count bypass check once for the main loop (when \p
898   /// ForEpilogue is false) and once for the epilogue loop (when \p
899   /// ForEpilogue is true).
900   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
901                                              bool ForEpilogue);
902   void printDebugTracesAtStart() override;
903   void printDebugTracesAtEnd() override;
904 };
905 
906 /// A specialized derived class of inner loop vectorizer that performs
907 /// vectorization of *epilogue* loops in the process of vectorizing loops and
908 /// their epilogues.
909 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
910 public:
911   EpilogueVectorizerEpilogueLoop(
912       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
913       DominatorTree *DT, const TargetLibraryInfo *TLI,
914       const TargetTransformInfo *TTI, AssumptionCache *AC,
915       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
916       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
917       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
918       GeneratedRTChecks &Checks)
919       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
920                                        EPI, LVL, CM, BFI, PSI, Checks) {}
921   /// Implements the interface for creating a vectorized skeleton using the
922   /// *epilogue loop* strategy (i.e. the second pass of VPlan execution).
923   std::pair<BasicBlock *, Value *>
924   createEpilogueVectorizedLoopSkeleton() final override;
925 
926 protected:
927   /// Emits an iteration count bypass check after the main vector loop has
928   /// finished to see if there are any iterations left to execute by either
929   /// the vector epilogue or the scalar epilogue.
930   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
931                                                       BasicBlock *Bypass,
932                                                       BasicBlock *Insert);
933   void printDebugTracesAtStart() override;
934   void printDebugTracesAtEnd() override;
935 };
936 } // end namespace llvm
937 
938 /// Look for a meaningful debug location on the instruction or its
939 /// operands.
940 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
941   if (!I)
942     return I;
943 
944   DebugLoc Empty;
945   if (I->getDebugLoc() != Empty)
946     return I;
947 
948   for (Use &Op : I->operands()) {
949     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
950       if (OpInst->getDebugLoc() != Empty)
951         return OpInst;
952   }
953 
954   return I;
955 }
956 
957 void InnerLoopVectorizer::setDebugLocFromInst(
958     const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
959   IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
960   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
961     const DILocation *DIL = Inst->getDebugLoc();
962 
963     // When an FSDiscriminator is enabled, we don't need to add the multiply
964     // factors to the discriminators.
965     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
966         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
967       // FIXME: For scalable vectors, assume vscale=1.
968       auto NewDIL =
969           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
970       if (NewDIL)
971         B->SetCurrentDebugLocation(NewDIL.getValue());
972       else
973         LLVM_DEBUG(dbgs()
974                    << "Failed to create new discriminator: "
975                    << DIL->getFilename() << " Line: " << DIL->getLine());
976     } else
977       B->SetCurrentDebugLocation(DIL);
978   } else
979     B->SetCurrentDebugLocation(DebugLoc());
980 }
981 
982 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
983 /// is passed, the message relates to that particular instruction.
984 #ifndef NDEBUG
985 static void debugVectorizationMessage(const StringRef Prefix,
986                                       const StringRef DebugMsg,
987                                       Instruction *I) {
988   dbgs() << "LV: " << Prefix << DebugMsg;
989   if (I != nullptr)
990     dbgs() << " " << *I;
991   else
992     dbgs() << '.';
993   dbgs() << '\n';
994 }
995 #endif
996 
997 /// Create an analysis remark that explains why vectorization failed
998 ///
999 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1000 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1001 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1002 /// the location of the remark.  \return the remark object that can be
1003 /// streamed to.
1004 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1005     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1006   Value *CodeRegion = TheLoop->getHeader();
1007   DebugLoc DL = TheLoop->getStartLoc();
1008 
1009   if (I) {
1010     CodeRegion = I->getParent();
1011     // If there is no debug location attached to the instruction, fall back to
1012     // using the loop's.
1013     if (I->getDebugLoc())
1014       DL = I->getDebugLoc();
1015   }
1016 
1017   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1018 }
1019 
1020 namespace llvm {
1021 
1022 /// Return a value for Step multiplied by VF.
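/// For example (illustrative): with Ty = i64, Step = 2 and a scalable VF of
/// vscale x 4, this returns "vscale * 8"; with a fixed VF of 4 and Step = 2 it
/// simply returns the constant i64 8.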
1023 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1024                        int64_t Step) {
1025   assert(Ty->isIntegerTy() && "Expected an integer step");
1026   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1027   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1028 }
1029 
1030 /// Return the runtime value for VF.
1031 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1032   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1033   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1034 }
1035 
1036 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1037                                   ElementCount VF) {
1038   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1039   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1040   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1041   return B.CreateUIToFP(RuntimeVF, FTy);
1042 }
1043 
1044 void reportVectorizationFailure(const StringRef DebugMsg,
1045                                 const StringRef OREMsg, const StringRef ORETag,
1046                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1047                                 Instruction *I) {
1048   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1049   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1050   ORE->emit(
1051       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1052       << "loop not vectorized: " << OREMsg);
1053 }
1054 
1055 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1056                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1057                              Instruction *I) {
1058   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1059   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1060   ORE->emit(
1061       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1062       << Msg);
1063 }
1064 
1065 } // end namespace llvm
1066 
1067 #ifndef NDEBUG
1068 /// \return string containing a file name and a line # for the given loop.
1069 static std::string getDebugLocString(const Loop *L) {
1070   std::string Result;
1071   if (L) {
1072     raw_string_ostream OS(Result);
1073     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1074       LoopDbgLoc.print(OS);
1075     else
1076       // Just print the module name.
1077       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1078     OS.flush();
1079   }
1080   return Result;
1081 }
1082 #endif
1083 
1084 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1085                                          const Instruction *Orig) {
1086   // If the loop was versioned with memchecks, add the corresponding no-alias
1087   // metadata.
1088   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1089     LVer->annotateInstWithNoAlias(To, Orig);
1090 }
1091 
1092 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1093     VPTransformState &State) {
1094 
1095   // Collect recipes in the backward slice of `Root` that may generate a poison
1096   // value that is used after vectorization.
1097   SmallPtrSet<VPRecipeBase *, 16> Visited;
1098   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1099     SmallVector<VPRecipeBase *, 16> Worklist;
1100     Worklist.push_back(Root);
1101 
1102     // Traverse the backward slice of Root through its use-def chain.
1103     while (!Worklist.empty()) {
1104       VPRecipeBase *CurRec = Worklist.back();
1105       Worklist.pop_back();
1106 
1107       if (!Visited.insert(CurRec).second)
1108         continue;
1109 
1110       // Prune search if we find another recipe generating a widen memory
1111       // instruction. Widen memory instructions involved in address computation
1112       // will lead to gather/scatter instructions, which don't need to be
1113       // handled.
1114       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1115           isa<VPInterleaveRecipe>(CurRec) ||
1116           isa<VPScalarIVStepsRecipe>(CurRec) ||
1117           isa<VPCanonicalIVPHIRecipe>(CurRec))
1118         continue;
1119 
1120       // This recipe contributes to the address computation of a widen
1121       // load/store. Collect recipe if its underlying instruction has
1122       // poison-generating flags.
1123       Instruction *Instr = CurRec->getUnderlyingInstr();
1124       if (Instr && Instr->hasPoisonGeneratingFlags())
1125         State.MayGeneratePoisonRecipes.insert(CurRec);
1126 
1127       // Add new definitions to the worklist.
1128       for (VPValue *operand : CurRec->operands())
1129         if (VPDef *OpDef = operand->getDef())
1130           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1131     }
1132   });
1133 
1134   // Traverse all the recipes in the VPlan and collect the poison-generating
1135   // recipes in the backward slice starting at the address of a
1136   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1137   auto Iter = depth_first(
1138       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1139   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1140     for (VPRecipeBase &Recipe : *VPBB) {
1141       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1142         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1143         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1144         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1145             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1146           collectPoisonGeneratingInstrsInBackwardSlice(
1147               cast<VPRecipeBase>(AddrDef));
1148       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1149         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1150         if (AddrDef) {
1151           // Check if any member of the interleave group needs predication.
1152           const InterleaveGroup<Instruction> *InterGroup =
1153               InterleaveRec->getInterleaveGroup();
1154           bool NeedPredication = false;
1155           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1156                I < NumMembers; ++I) {
1157             Instruction *Member = InterGroup->getMember(I);
1158             if (Member)
1159               NeedPredication |=
1160                   Legal->blockNeedsPredication(Member->getParent());
1161           }
1162 
1163           if (NeedPredication)
1164             collectPoisonGeneratingInstrsInBackwardSlice(
1165                 cast<VPRecipeBase>(AddrDef));
1166         }
1167       }
1168     }
1169   }
1170 }
1171 
1172 void InnerLoopVectorizer::addMetadata(Instruction *To,
1173                                       Instruction *From) {
1174   propagateMetadata(To, From);
1175   addNewMetadata(To, From);
1176 }
1177 
1178 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1179                                       Instruction *From) {
1180   for (Value *V : To) {
1181     if (Instruction *I = dyn_cast<Instruction>(V))
1182       addMetadata(I, From);
1183   }
1184 }
1185 
1186 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1187     const RecurrenceDescriptor &RdxDesc) {
1188   auto It = ReductionResumeValues.find(&RdxDesc);
1189   assert(It != ReductionResumeValues.end() &&
1190          "Expected to find a resume value for the reduction.");
1191   return It->second;
1192 }
1193 
1194 namespace llvm {
1195 
1196 // A hint to the loop vectorization cost model about how the scalar epilogue
1197 // loop should be lowered.
1198 enum ScalarEpilogueLowering {
1199 
1200   // The default: allowing scalar epilogues.
1201   CM_ScalarEpilogueAllowed,
1202 
1203   // Vectorization with OptForSize: don't allow epilogues.
1204   CM_ScalarEpilogueNotAllowedOptSize,
1205 
1206   // A special case of vectorization with OptForSize: loops with a very small
1207   // trip count are considered for vectorization under OptForSize, thereby
1208   // making sure the cost of their loop body is dominant, free of runtime
1209   // guards and scalar iteration overheads.
1210   CM_ScalarEpilogueNotAllowedLowTripLoop,
1211 
1212   // Loop hint predicate indicating an epilogue is undesired.
1213   CM_ScalarEpilogueNotNeededUsePredicate,
1214 
1215   // Directive indicating we must either tail-fold or not vectorize at all.
1216   CM_ScalarEpilogueNotAllowedUsePredicate
1217 };
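
// Illustrative mapping (a sketch, not exhaustive): a loop in a function
// optimized for size typically maps to CM_ScalarEpilogueNotAllowedOptSize,
// while a loop carrying the llvm.loop.vectorize.predicate.enable metadata
// maps to CM_ScalarEpilogueNotNeededUsePredicate; the exact choice is made
// when the cost model is set up.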
1218 
1219 /// ElementCountComparator creates a total ordering for ElementCount
1220 /// for the purposes of using it in a set structure.
1221 struct ElementCountComparator {
1222   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1223     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1224            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1225   }
1226 };
1227 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
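
// For illustration (a sketch of the ordering, not exercised here): fixed
// element counts order before scalable ones, and within each kind the known
// minimum value decides, e.g.
//   ElementCountComparator Cmp;
//   Cmp(ElementCount::getFixed(4), ElementCount::getScalable(4));   // true
//   Cmp(ElementCount::getScalable(2), ElementCount::getFixed(64));  // false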
1228 
1229 /// LoopVectorizationCostModel - estimates the expected speedups due to
1230 /// vectorization.
1231 /// In many cases vectorization is not profitable. This can happen for a
1232 /// number of reasons. In this class we mainly attempt to predict the expected
1233 /// speedup/slowdown due to the supported instruction set. We use the
1234 /// TargetTransformInfo to query the different backends for the cost of
1235 /// different operations.
1236 class LoopVectorizationCostModel {
1237 public:
1238   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1239                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1240                              LoopVectorizationLegality *Legal,
1241                              const TargetTransformInfo &TTI,
1242                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1243                              AssumptionCache *AC,
1244                              OptimizationRemarkEmitter *ORE, const Function *F,
1245                              const LoopVectorizeHints *Hints,
1246                              InterleavedAccessInfo &IAI)
1247       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1248         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1249         Hints(Hints), InterleaveInfo(IAI) {}
1250 
1251   /// \return An upper bound for the vectorization factors (both fixed and
1252   /// scalable). If the factors are 0, vectorization and interleaving should be
1253   /// avoided up front.
1254   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1255 
1256   /// \return True if runtime checks are required for vectorization, and false
1257   /// otherwise.
1258   bool runtimeChecksRequired();
1259 
1260   /// \return The most profitable vectorization factor and the cost of that VF.
1261   /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
1262   /// that vectorization factor will be selected as long as vectorization is
1263   /// possible.
1264   VectorizationFactor
1265   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1266 
1267   VectorizationFactor
1268   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1269                                     const LoopVectorizationPlanner &LVP);
1270 
1271   /// Setup cost-based decisions for user vectorization factor.
1272   /// \return true if the UserVF is a feasible VF to be chosen.
1273   bool selectUserVectorizationFactor(ElementCount UserVF) {
1274     collectUniformsAndScalars(UserVF);
1275     collectInstsToScalarize(UserVF);
1276     return expectedCost(UserVF).first.isValid();
1277   }
1278 
1279   /// \return The size (in bits) of the smallest and widest types in the code
1280   /// that needs to be vectorized. We ignore values that remain scalar, such as
1281   /// 64-bit loop indices.
1282   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1283 
1284   /// \return The desired interleave count.
1285   /// If interleave count has been specified by metadata it will be returned.
1286   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1287   /// are the selected vectorization factor and the cost of the selected VF.
1288   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1289 
1290   /// A memory access instruction may be vectorized in more than one way.
1291   /// The form of the instruction after vectorization depends on its cost.
1292   /// This function takes cost-based decisions for Load/Store instructions
1293   /// and collects them in a map. This decision map is used for building
1294   /// the lists of loop-uniform and loop-scalar instructions.
1295   /// The calculated cost is saved with the widening decision in order to
1296   /// avoid redundant calculations.
1297   void setCostBasedWideningDecision(ElementCount VF);
1298 
1299   /// A struct that represents some properties of the register usage
1300   /// of a loop.
1301   struct RegisterUsage {
1302     /// Holds the number of loop invariant values that are used in the loop.
1303     /// The key is the ClassID of the target-provided register class.
1304     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1305     /// Holds the maximum number of concurrent live intervals in the loop.
1306     /// The key is the ClassID of the target-provided register class.
1307     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1308   };
1309 
1310   /// \return Information about the register usage of the loop for the
1311   /// given vectorization factors.
1312   SmallVector<RegisterUsage, 8>
1313   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1314 
1315   /// Collect values we want to ignore in the cost model.
1316   void collectValuesToIgnore();
1317 
1318   /// Collect all element types in the loop for which widening is needed.
1319   void collectElementTypesForWidening();
1320 
1321   /// Split reductions into those that happen in the loop, and those that happen
1322   /// outside. In-loop reductions are collected into InLoopReductionChains.
1323   void collectInLoopReductions();
1324 
1325   /// Returns true if we should use strict in-order reductions for the given
1326   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1327   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1328   /// of FP operations.
1329   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1330     return !Hints->allowReordering() && RdxDesc.isOrdered();
1331   }
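
  // Illustrative example (not taken from this file): for
  //   float S = 0.f;
  //   for (int I = 0; I < N; ++I) S += A[I];
  // compiled without permission to reassociate FP math, RdxDesc.isOrdered()
  // is true and the vectorizer must preserve the original left-to-right
  // addition order, e.g. by emitting an in-loop ordered reduction instead of
  // a wide vector add followed by a final horizontal reduction.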
1332 
1333   /// \returns The smallest bitwidth each instruction can be represented with.
1334   /// The vector equivalents of these instructions should be truncated to this
1335   /// type.
1336   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1337     return MinBWs;
1338   }
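
  // For example (hypothetical): an i32 arithmetic chain whose results are
  // only ever consumed through an i8 truncate can be recorded here with a
  // minimal bitwidth of 8, so its vector form may operate on <VF x i8>
  // instead of <VF x i32>.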
1339 
1340   /// \returns True if it is more profitable to scalarize instruction \p I for
1341   /// vectorization factor \p VF.
1342   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1343     assert(VF.isVector() &&
1344            "Profitable to scalarize relevant only for VF > 1.");
1345 
1346     // Cost model is not run in the VPlan-native path - return conservative
1347     // result until this changes.
1348     if (EnableVPlanNativePath)
1349       return false;
1350 
1351     auto Scalars = InstsToScalarize.find(VF);
1352     assert(Scalars != InstsToScalarize.end() &&
1353            "VF not yet analyzed for scalarization profitability");
1354     return Scalars->second.find(I) != Scalars->second.end();
1355   }
1356 
1357   /// Returns true if \p I is known to be uniform after vectorization.
1358   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1359     if (VF.isScalar())
1360       return true;
1361 
1362     // Cost model is not run in the VPlan-native path - return conservative
1363     // result until this changes.
1364     if (EnableVPlanNativePath)
1365       return false;
1366 
1367     auto UniformsPerVF = Uniforms.find(VF);
1368     assert(UniformsPerVF != Uniforms.end() &&
1369            "VF not yet analyzed for uniformity");
1370     return UniformsPerVF->second.count(I);
1371   }
1372 
1373   /// Returns true if \p I is known to be scalar after vectorization.
1374   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1375     if (VF.isScalar())
1376       return true;
1377 
1378     // Cost model is not run in the VPlan-native path - return conservative
1379     // result until this changes.
1380     if (EnableVPlanNativePath)
1381       return false;
1382 
1383     auto ScalarsPerVF = Scalars.find(VF);
1384     assert(ScalarsPerVF != Scalars.end() &&
1385            "Scalar values are not calculated for VF");
1386     return ScalarsPerVF->second.count(I);
1387   }
1388 
1389   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1390   /// for vectorization factor \p VF.
1391   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1392     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1393            !isProfitableToScalarize(I, VF) &&
1394            !isScalarAfterVectorization(I, VF);
1395   }
1396 
1397   /// Decision that was taken during cost calculation for memory instruction.
1398   enum InstWidening {
1399     CM_Unknown,
1400     CM_Widen,         // For consecutive accesses with stride +1.
1401     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1402     CM_Interleave,
1403     CM_GatherScatter,
1404     CM_Scalarize
1405   };
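
  // Hypothetical examples of how these decisions tend to map to accesses (the
  // real choice is cost-driven): a unit-stride load A[i] is typically
  // CM_Widen, a reversed access A[N - i] is CM_Widen_Reverse, members of a
  // group such as A[2*i] and A[2*i + 1] are CM_Interleave, an indexed access
  // A[B[i]] is CM_GatherScatter, and anything else falls back to
  // CM_Scalarize.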
1406 
1407   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1408   /// instruction \p I and vector width \p VF.
1409   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1410                            InstructionCost Cost) {
1411     assert(VF.isVector() && "Expected VF >=2");
1412     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1413   }
1414 
1415   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1416   /// interleaving group \p Grp and vector width \p VF.
1417   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1418                            ElementCount VF, InstWidening W,
1419                            InstructionCost Cost) {
1420     assert(VF.isVector() && "Expected VF >=2");
1421     // Broadcast this decision to all instructions inside the group,
1422     // but the cost will be assigned to one instruction only.
1423     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1424       if (auto *I = Grp->getMember(i)) {
1425         if (Grp->getInsertPos() == I)
1426           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1427         else
1428           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1429       }
1430     }
1431   }
1432 
1433   /// Return the cost model decision for the given instruction \p I and vector
1434   /// width \p VF. Return CM_Unknown if this instruction did not pass
1435   /// through the cost modeling.
1436   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1437     assert(VF.isVector() && "Expected VF to be a vector VF");
1438     // Cost model is not run in the VPlan-native path - return conservative
1439     // result until this changes.
1440     if (EnableVPlanNativePath)
1441       return CM_GatherScatter;
1442 
1443     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1444     auto Itr = WideningDecisions.find(InstOnVF);
1445     if (Itr == WideningDecisions.end())
1446       return CM_Unknown;
1447     return Itr->second.first;
1448   }
1449 
1450   /// Return the vectorization cost for the given instruction \p I and vector
1451   /// width \p VF.
1452   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1453     assert(VF.isVector() && "Expected VF >=2");
1454     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1455     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1456            "The cost is not calculated");
1457     return WideningDecisions[InstOnVF].second;
1458   }
1459 
1460   /// Return true if instruction \p I is an optimizable truncate whose operand
1461   /// is an induction variable. Such a truncate will be removed by adding a new
1462   /// induction variable with the destination type.
1463   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1464     // If the instruction is not a truncate, return false.
1465     auto *Trunc = dyn_cast<TruncInst>(I);
1466     if (!Trunc)
1467       return false;
1468 
1469     // Get the source and destination types of the truncate.
1470     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1471     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1472 
1473     // If the truncate is free for the given types, return false. Replacing a
1474     // free truncate with an induction variable would add an induction variable
1475     // update instruction to each iteration of the loop. We exclude from this
1476     // check the primary induction variable since it will need an update
1477     // instruction regardless.
1478     Value *Op = Trunc->getOperand(0);
1479     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1480       return false;
1481 
1482     // If the truncated value is not an induction variable, return false.
1483     return Legal->isInductionPhi(Op);
1484   }
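
  // Illustrative sketch (hypothetical IR): for a truncate of the induction
  // variable such as
  //   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  //   %trunc = trunc i64 %iv to i32
  // this returns true (unless the trunc is free and %iv is not the primary
  // induction), and the vectorizer can introduce a new i32 induction variable
  // instead of widening the trunc.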
1485 
1486   /// Collects the instructions to scalarize for each predicated instruction in
1487   /// the loop.
1488   void collectInstsToScalarize(ElementCount VF);
1489 
1490   /// Collect Uniform and Scalar values for the given \p VF.
1491   /// The sets depend on CM decision for Load/Store instructions
1492   /// that may be vectorized as interleave, gather-scatter or scalarized.
1493   void collectUniformsAndScalars(ElementCount VF) {
1494     // Do the analysis once.
1495     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1496       return;
1497     setCostBasedWideningDecision(VF);
1498     collectLoopUniforms(VF);
1499     collectLoopScalars(VF);
1500   }
1501 
1502   /// Returns true if the target machine supports masked store operation
1503   /// for the given \p DataType and kind of access to \p Ptr.
1504   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1505     return Legal->isConsecutivePtr(DataType, Ptr) &&
1506            TTI.isLegalMaskedStore(DataType, Alignment);
1507   }
1508 
1509   /// Returns true if the target machine supports masked load operation
1510   /// for the given \p DataType and kind of access to \p Ptr.
1511   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1512     return Legal->isConsecutivePtr(DataType, Ptr) &&
1513            TTI.isLegalMaskedLoad(DataType, Alignment);
1514   }
1515 
1516   /// Returns true if the target machine can represent \p V as a masked gather
1517   /// or scatter operation.
1518   bool isLegalGatherOrScatter(Value *V,
1519                               ElementCount VF = ElementCount::getFixed(1)) {
1520     bool LI = isa<LoadInst>(V);
1521     bool SI = isa<StoreInst>(V);
1522     if (!LI && !SI)
1523       return false;
1524     auto *Ty = getLoadStoreType(V);
1525     Align Align = getLoadStoreAlignment(V);
1526     if (VF.isVector())
1527       Ty = VectorType::get(Ty, VF);
1528     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1529            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1530   }
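
  // For example: for a load of i32 elements the legality query above is made
  // for <4 x i32> when VF is the fixed count 4, for <vscale x 4 x i32> when
  // VF is scalable 4, and for the scalar i32 type with the default VF of 1.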
1531 
1532   /// Returns true if the target machine supports all of the reduction
1533   /// variables found for the given VF.
1534   bool canVectorizeReductions(ElementCount VF) const {
1535     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1536       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1537       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1538     }));
1539   }
1540 
1541   /// Returns true if \p I is an instruction that will be scalarized with
1542   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1543   /// instructions include conditional stores and instructions that may divide
1544   /// by zero.
1545   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
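
  // For example (illustrative): in a loop body like
  //   if (C[i]) A[i] = X / B[i];
  // both the conditional store to A[i] and the division by B[i] may have to
  // be scalarized and predicated, since the store must not happen and the
  // division must not trap on lanes where C[i] is false.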
1546 
1547   /// Returns true if \p I is an instruction that will be predicated either
1548   /// through scalar predication or masked load/store or masked gather/scatter.
1549   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1550   /// Superset of instructions that return true for isScalarWithPredication.
1551   bool isPredicatedInst(Instruction *I, ElementCount VF,
1552                         bool IsKnownUniform = false) {
1553     // When we know the load is uniform and the original scalar loop was not
1554     // predicated, we don't need to mark it as a predicated instruction. Any
1555     // vectorized blocks created when tail-folding are artificial blocks we
1556     // have introduced, and we know there is always at least one active lane.
1557     // That's why we call Legal->blockNeedsPredication here: it doesn't
1558     // query tail-folding.
1559     if (IsKnownUniform && isa<LoadInst>(I) &&
1560         !Legal->blockNeedsPredication(I->getParent()))
1561       return false;
1562     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1563       return false;
1564     // Loads and stores that need some form of masked operation are predicated
1565     // instructions.
1566     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1567       return Legal->isMaskRequired(I);
1568     return isScalarWithPredication(I, VF);
1569   }
1570 
1571   /// Returns true if \p I is a memory instruction with consecutive memory
1572   /// access that can be widened.
1573   bool
1574   memoryInstructionCanBeWidened(Instruction *I,
1575                                 ElementCount VF = ElementCount::getFixed(1));
1576 
1577   /// Returns true if \p I is a memory instruction in an interleaved-group
1578   /// of memory accesses that can be vectorized with wide vector loads/stores
1579   /// and shuffles.
1580   bool
1581   interleavedAccessCanBeWidened(Instruction *I,
1582                                 ElementCount VF = ElementCount::getFixed(1));
1583 
1584   /// Check if \p Instr belongs to any interleaved access group.
1585   bool isAccessInterleaved(Instruction *Instr) {
1586     return InterleaveInfo.isInterleaved(Instr);
1587   }
1588 
1589   /// Get the interleaved access group that \p Instr belongs to.
1590   const InterleaveGroup<Instruction> *
1591   getInterleavedAccessGroup(Instruction *Instr) {
1592     return InterleaveInfo.getInterleaveGroup(Instr);
1593   }
1594 
1595   /// Returns true if we're required to use a scalar epilogue for at least
1596   /// the final iteration of the original loop.
1597   bool requiresScalarEpilogue(ElementCount VF) const {
1598     if (!isScalarEpilogueAllowed())
1599       return false;
1600     // If we might exit from anywhere but the latch, we must run the exiting
1601     // iteration in scalar form.
1602     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1603       return true;
1604     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1605   }
1606 
1607   /// Returns true if a scalar epilogue is allowed, i.e., it has not been
1608   /// disallowed due to optsize or a loop hint annotation.
1609   bool isScalarEpilogueAllowed() const {
1610     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1611   }
1612 
1613   /// Returns true if all loop blocks should be masked to fold the loop tail.
1614   bool foldTailByMasking() const { return FoldTailByMasking; }
1615 
1616   /// Returns true if the instructions in this block require predication
1617   /// for any reason, e.g. because tail folding now requires a predicate
1618   /// or because the block in the original loop was predicated.
1619   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1620     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1621   }
1622 
1623   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1624   /// nodes to the chain of instructions representing the reductions. Uses a
1625   /// MapVector to ensure deterministic iteration order.
1626   using ReductionChainMap =
1627       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1628 
1629   /// Return the chain of instructions representing an inloop reduction.
1630   const ReductionChainMap &getInLoopReductionChains() const {
1631     return InLoopReductionChains;
1632   }
1633 
1634   /// Returns true if the Phi is part of an inloop reduction.
1635   bool isInLoopReduction(PHINode *Phi) const {
1636     return InLoopReductionChains.count(Phi);
1637   }
1638 
1639   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1640   /// with factor VF.  Return the cost of the instruction, including
1641   /// scalarization overhead if it's needed.
1642   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1643 
1644   /// Estimate cost of a call instruction CI if it were vectorized with factor
1645   /// VF. Return the cost of the instruction, including scalarization overhead
1646   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1647   /// scalarized, i.e. either a vector version isn't available or it is too
1648   /// expensive.
1649   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1650                                     bool &NeedToScalarize) const;
1651 
1652   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1653   /// that of B.
1654   bool isMoreProfitable(const VectorizationFactor &A,
1655                         const VectorizationFactor &B) const;
1656 
1657   /// Invalidates decisions already taken by the cost model.
1658   void invalidateCostModelingDecisions() {
1659     WideningDecisions.clear();
1660     Uniforms.clear();
1661     Scalars.clear();
1662   }
1663 
1664 private:
1665   unsigned NumPredStores = 0;
1666 
1667   /// Convenience function that returns the value of vscale_range if
1668   /// vscale_range.min == vscale_range.max, and otherwise returns the value
1669   /// returned by the corresponding TTI method.
1670   Optional<unsigned> getVScaleForTuning() const;
1671 
1672   /// \return An upper bound for the vectorization factors for both
1673   /// fixed and scalable vectorization, where the minimum-known number of
1674   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1675   /// disabled or unsupported, then the scalable part will be equal to
1676   /// ElementCount::getScalable(0).
1677   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1678                                            ElementCount UserVF,
1679                                            bool FoldTailByMasking);
1680 
1681   /// \return the maximized element count based on the target's vector
1682   /// registers and the loop trip-count, but limited to a maximum safe VF.
1683   /// This is a helper function of computeFeasibleMaxVF.
1684   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1685   /// issue that occurred on one of the buildbots which cannot be reproduced
1686   /// without having access to the proprietary compiler (see comments on
1687   /// D98509). The issue is currently under investigation and this workaround
1688   /// will be removed as soon as possible.
1689   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1690                                        unsigned SmallestType,
1691                                        unsigned WidestType,
1692                                        const ElementCount &MaxSafeVF,
1693                                        bool FoldTailByMasking);
1694 
1695   /// \return the maximum legal scalable VF, based on the safe max number
1696   /// of elements.
1697   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1698 
1699   /// The vectorization cost is a combination of the cost itself and a boolean
1700   /// indicating whether any of the contributing operations will actually
1701   /// operate on vector values after type legalization in the backend. If this
1702   /// latter value is false, then all operations will be scalarized (i.e. no
1703   /// vectorization has actually taken place).
1704   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1705 
1706   /// Returns the expected execution cost. The unit of the cost does
1707   /// not matter because we use the 'cost' units to compare different
1708   /// vector widths. The cost that is returned is *not* normalized by
1709   /// the factor width. If \p Invalid is not nullptr, this function
1710   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1711   /// each instruction that has an Invalid cost for the given VF.
1712   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1713   VectorizationCostTy
1714   expectedCost(ElementCount VF,
1715                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1716 
1717   /// Returns the execution time cost of an instruction for a given vector
1718   /// width. Vector width of one means scalar.
1719   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1720 
1721   /// The cost-computation logic from getInstructionCost which provides
1722   /// the vector type as an output parameter.
1723   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1724                                      Type *&VectorTy);
1725 
1726   /// Return the cost of instructions in an inloop reduction pattern, if I is
1727   /// part of that pattern.
1728   Optional<InstructionCost>
1729   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1730                           TTI::TargetCostKind CostKind);
1731 
1732   /// Calculate vectorization cost of memory instruction \p I.
1733   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1734 
1735   /// The cost computation for scalarized memory instruction.
1736   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1737 
1738   /// The cost computation for interleaving group of memory instructions.
1739   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1740 
1741   /// The cost computation for Gather/Scatter instruction.
1742   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1743 
1744   /// The cost computation for widening instruction \p I with consecutive
1745   /// memory access.
1746   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1747 
1748   /// The cost calculation for Load/Store instruction \p I with a uniform pointer:
1749   /// Load: scalar load + broadcast.
1750   /// Store: scalar store + (loop invariant value stored ? 0 : extract of last
1751   /// element).
1752   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1753 
1754   /// Estimate the overhead of scalarizing an instruction. This is a
1755   /// convenience wrapper for the type-based getScalarizationOverhead API.
1756   InstructionCost getScalarizationOverhead(Instruction *I,
1757                                            ElementCount VF) const;
1758 
1759   /// Returns whether the instruction is a load or store and will be emitted
1760   /// as a vector operation.
1761   bool isConsecutiveLoadOrStore(Instruction *I);
1762 
1763   /// Returns true if an artificially high cost for emulated masked memrefs
1764   /// should be used.
1765   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1766 
1767   /// Map of scalar integer values to the smallest bitwidth they can be legally
1768   /// represented as. The vector equivalents of these values should be truncated
1769   /// to this type.
1770   MapVector<Instruction *, uint64_t> MinBWs;
1771 
1772   /// A type representing the costs for instructions if they were to be
1773   /// scalarized rather than vectorized. The entries are Instruction-Cost
1774   /// pairs.
1775   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1776 
1777   /// A set containing all BasicBlocks that are known to be present after
1778   /// vectorization as predicated blocks.
1779   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1780 
1781   /// Records whether it is allowed to have the original scalar loop execute at
1782   /// least once. This may be needed as a fallback loop in case runtime
1783   /// aliasing/dependence checks fail, or to handle the tail/remainder
1784   /// iterations when the trip count is unknown or is not a multiple of the VF,
1785   /// or as a peel-loop to handle gaps in interleave-groups.
1786   /// Under optsize and when the trip count is very small we don't allow any
1787   /// iterations to execute in the scalar loop.
1788   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1789 
1790   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1791   bool FoldTailByMasking = false;
1792 
1793   /// A map holding scalar costs for different vectorization factors. The
1794   /// presence of a cost for an instruction in the mapping indicates that the
1795   /// instruction will be scalarized when vectorizing with the associated
1796   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1797   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1798 
1799   /// Holds the instructions known to be uniform after vectorization.
1800   /// The data is collected per VF.
1801   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1802 
1803   /// Holds the instructions known to be scalar after vectorization.
1804   /// The data is collected per VF.
1805   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1806 
1807   /// Holds the instructions (address computations) that are forced to be
1808   /// scalarized.
1809   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1810 
1811   /// PHINodes of the reductions that should be expanded in-loop along with
1812   /// their associated chains of reduction operations, in program order from top
1813   /// (PHI) to bottom.
1814   ReductionChainMap InLoopReductionChains;
1815 
1816   /// A Map of inloop reduction operations and their immediate chain operand.
1817   /// FIXME: This can be removed once reductions can be costed correctly in
1818   /// VPlan. This was added to allow quick lookup of the inloop operations,
1819   /// without having to loop through InLoopReductionChains.
1820   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1821 
1822   /// Returns the expected difference in cost from scalarizing the expression
1823   /// feeding a predicated instruction \p PredInst. The instructions to
1824   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1825   /// non-negative return value implies the expression will be scalarized.
1826   /// Currently, only single-use chains are considered for scalarization.
1827   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1828                               ElementCount VF);
1829 
1830   /// Collect the instructions that are uniform after vectorization. An
1831   /// instruction is uniform if we represent it with a single scalar value in
1832   /// the vectorized loop corresponding to each vector iteration. Examples of
1833   /// uniform instructions include pointer operands of consecutive or
1834   /// interleaved memory accesses. Note that although uniformity implies an
1835   /// instruction will be scalar, the reverse is not true. In general, a
1836   /// scalarized instruction will be represented by VF scalar values in the
1837   /// vectorized loop, each corresponding to an iteration of the original
1838   /// scalar loop.
1839   void collectLoopUniforms(ElementCount VF);
1840 
1841   /// Collect the instructions that are scalar after vectorization. An
1842   /// instruction is scalar if it is known to be uniform or will be scalarized
1843   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1844   /// to the list if they are used by a load/store instruction that is marked as
1845   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1846   /// VF values in the vectorized loop, each corresponding to an iteration of
1847   /// the original scalar loop.
1848   void collectLoopScalars(ElementCount VF);
1849 
1850   /// Keeps cost model vectorization decision and cost for instructions.
1851   /// Right now it is used for memory instructions only.
1852   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1853                                 std::pair<InstWidening, InstructionCost>>;
1854 
1855   DecisionList WideningDecisions;
1856 
1857   /// Returns true if \p V is expected to be vectorized and it needs to be
1858   /// extracted.
1859   bool needsExtract(Value *V, ElementCount VF) const {
1860     Instruction *I = dyn_cast<Instruction>(V);
1861     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1862         TheLoop->isLoopInvariant(I))
1863       return false;
1864 
1865     // Assume we can vectorize V (and hence we need extraction) if the
1866     // scalars are not computed yet. This can happen, because it is called
1867     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1868     // the scalars are collected. That should be a safe assumption in most
1869     // cases, because we check if the operands have vectorizable types
1870     // beforehand in LoopVectorizationLegality.
1871     return Scalars.find(VF) == Scalars.end() ||
1872            !isScalarAfterVectorization(I, VF);
1873   }
1874 
1875   /// Returns a range containing only operands needing to be extracted.
1876   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1877                                                    ElementCount VF) const {
1878     return SmallVector<Value *, 4>(make_filter_range(
1879         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1880   }
1881 
1882   /// Determines if we have the infrastructure to vectorize loop \p L and its
1883   /// epilogue, assuming the main loop is vectorized by \p VF.
1884   bool isCandidateForEpilogueVectorization(const Loop &L,
1885                                            const ElementCount VF) const;
1886 
1887   /// Returns true if epilogue vectorization is considered profitable, and
1888   /// false otherwise.
1889   /// \p VF is the vectorization factor chosen for the original loop.
1890   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1891 
1892 public:
1893   /// The loop that we evaluate.
1894   Loop *TheLoop;
1895 
1896   /// Predicated scalar evolution analysis.
1897   PredicatedScalarEvolution &PSE;
1898 
1899   /// Loop Info analysis.
1900   LoopInfo *LI;
1901 
1902   /// Vectorization legality.
1903   LoopVectorizationLegality *Legal;
1904 
1905   /// Vector target information.
1906   const TargetTransformInfo &TTI;
1907 
1908   /// Target Library Info.
1909   const TargetLibraryInfo *TLI;
1910 
1911   /// Demanded bits analysis.
1912   DemandedBits *DB;
1913 
1914   /// Assumption cache.
1915   AssumptionCache *AC;
1916 
1917   /// Interface to emit optimization remarks.
1918   OptimizationRemarkEmitter *ORE;
1919 
1920   const Function *TheFunction;
1921 
1922   /// Loop Vectorize Hint.
1923   const LoopVectorizeHints *Hints;
1924 
1925   /// The interleave access information contains groups of interleaved accesses
1926   /// that have the same stride and are close to each other.
1927   InterleavedAccessInfo &InterleaveInfo;
1928 
1929   /// Values to ignore in the cost model.
1930   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1931 
1932   /// Values to ignore in the cost model when VF > 1.
1933   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1934 
1935   /// All element types found in the loop.
1936   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1937 
1938   /// Profitable vector factors.
1939   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1940 };
1941 } // end namespace llvm
1942 
1943 /// Helper struct to manage generating runtime checks for vectorization.
1944 ///
1945 /// The runtime checks are created up-front in temporary blocks to allow better
1946 /// estimating the cost and un-linked from the existing IR. After deciding to
1947 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1948 /// temporary blocks are completely removed.
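///
/// A typical lifecycle (an illustrative sketch; the names SE, DT, LI, DL, L,
/// LAI, Pred, Bypass, VectorPH and ExitBB are placeholders):
///   GeneratedRTChecks Checks(SE, DT, LI, DL);
///   Checks.Create(L, LAI, Pred);       // build checks in temporary blocks
///   // ... cost the checks and decide whether to vectorize ...
///   Checks.emitSCEVChecks(Bypass, VectorPH, ExitBB);
///   Checks.emitMemRuntimeChecks(L, Bypass, VectorPH);
///   // Any check blocks left unused are deleted by the destructor.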
1949 class GeneratedRTChecks {
1950   /// Basic block which contains the generated SCEV checks, if any.
1951   BasicBlock *SCEVCheckBlock = nullptr;
1952 
1953   /// The value representing the result of the generated SCEV checks. If it is
1954   /// nullptr, either no SCEV checks have been generated or they have been used.
1955   Value *SCEVCheckCond = nullptr;
1956 
1957   /// Basic block which contains the generated memory runtime checks, if any.
1958   BasicBlock *MemCheckBlock = nullptr;
1959 
1960   /// The value representing the result of the generated memory runtime checks.
1961   /// If it is nullptr, either no memory runtime checks have been generated or
1962   /// they have been used.
1963   Value *MemRuntimeCheckCond = nullptr;
1964 
1965   DominatorTree *DT;
1966   LoopInfo *LI;
1967 
1968   SCEVExpander SCEVExp;
1969   SCEVExpander MemCheckExp;
1970 
1971 public:
1972   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1973                     const DataLayout &DL)
1974       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1975         MemCheckExp(SE, DL, "scev.check") {}
1976 
1977   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1978   /// accurately estimate the cost of the runtime checks. The blocks are
1979   /// un-linked from the IR and are added back during vector code generation. If
1980   /// there is no vector code generation, the check blocks are removed
1981   /// completely.
1982   void Create(Loop *L, const LoopAccessInfo &LAI,
1983               const SCEVPredicate &Pred) {
1984 
1985     BasicBlock *LoopHeader = L->getHeader();
1986     BasicBlock *Preheader = L->getLoopPreheader();
1987 
1988     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1989     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1990     // may be used by SCEVExpander. The blocks will be un-linked from their
1991     // predecessors and removed from LI & DT at the end of the function.
1992     if (!Pred.isAlwaysTrue()) {
1993       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1994                                   nullptr, "vector.scevcheck");
1995 
1996       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1997           &Pred, SCEVCheckBlock->getTerminator());
1998     }
1999 
2000     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2001     if (RtPtrChecking.Need) {
2002       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2003       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2004                                  "vector.memcheck");
2005 
2006       MemRuntimeCheckCond =
2007           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2008                            RtPtrChecking.getChecks(), MemCheckExp);
2009       assert(MemRuntimeCheckCond &&
2010              "no RT checks generated although RtPtrChecking "
2011              "claimed checks are required");
2012     }
2013 
2014     if (!MemCheckBlock && !SCEVCheckBlock)
2015       return;
2016 
2017     // Unhook the temporary blocks containing the checks and update various
2018     // places accordingly.
2019     if (SCEVCheckBlock)
2020       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2021     if (MemCheckBlock)
2022       MemCheckBlock->replaceAllUsesWith(Preheader);
2023 
2024     if (SCEVCheckBlock) {
2025       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2026       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2027       Preheader->getTerminator()->eraseFromParent();
2028     }
2029     if (MemCheckBlock) {
2030       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2031       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2032       Preheader->getTerminator()->eraseFromParent();
2033     }
2034 
2035     DT->changeImmediateDominator(LoopHeader, Preheader);
2036     if (MemCheckBlock) {
2037       DT->eraseNode(MemCheckBlock);
2038       LI->removeBlock(MemCheckBlock);
2039     }
2040     if (SCEVCheckBlock) {
2041       DT->eraseNode(SCEVCheckBlock);
2042       LI->removeBlock(SCEVCheckBlock);
2043     }
2044   }
2045 
2046   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2047   /// unused.
2048   ~GeneratedRTChecks() {
2049     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2050     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2051     if (!SCEVCheckCond)
2052       SCEVCleaner.markResultUsed();
2053 
2054     if (!MemRuntimeCheckCond)
2055       MemCheckCleaner.markResultUsed();
2056 
2057     if (MemRuntimeCheckCond) {
2058       auto &SE = *MemCheckExp.getSE();
2059       // Memory runtime check generation creates compares that use expanded
2060       // values. Remove them before running the SCEVExpanderCleaners.
2061       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2062         if (MemCheckExp.isInsertedInstruction(&I))
2063           continue;
2064         SE.forgetValue(&I);
2065         I.eraseFromParent();
2066       }
2067     }
2068     MemCheckCleaner.cleanup();
2069     SCEVCleaner.cleanup();
2070 
2071     if (SCEVCheckCond)
2072       SCEVCheckBlock->eraseFromParent();
2073     if (MemRuntimeCheckCond)
2074       MemCheckBlock->eraseFromParent();
2075   }
2076 
2077   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2078   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2079   /// depending on the generated condition.
2080   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2081                              BasicBlock *LoopVectorPreHeader,
2082                              BasicBlock *LoopExitBlock) {
2083     if (!SCEVCheckCond)
2084       return nullptr;
2085     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2086       if (C->isZero())
2087         return nullptr;
2088 
2089     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2090 
2091     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2092     // Create new preheader for vector loop.
2093     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2094       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2095 
2096     SCEVCheckBlock->getTerminator()->eraseFromParent();
2097     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2098     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2099                                                 SCEVCheckBlock);
2100 
2101     DT->addNewBlock(SCEVCheckBlock, Pred);
2102     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2103 
2104     ReplaceInstWithInst(
2105         SCEVCheckBlock->getTerminator(),
2106         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2107     // Mark the check as used, to prevent it from being removed during cleanup.
2108     SCEVCheckCond = nullptr;
2109     return SCEVCheckBlock;
2110   }
2111 
2112   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2113   /// the branches to branch to the vector preheader or \p Bypass, depending on
2114   /// the generated condition.
2115   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2116                                    BasicBlock *LoopVectorPreHeader) {
2117     // Check if we generated code that checks in runtime if arrays overlap.
2118     if (!MemRuntimeCheckCond)
2119       return nullptr;
2120 
2121     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2122     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2123                                                 MemCheckBlock);
2124 
2125     DT->addNewBlock(MemCheckBlock, Pred);
2126     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2127     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2128 
2129     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2130       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2131 
2132     ReplaceInstWithInst(
2133         MemCheckBlock->getTerminator(),
2134         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2135     MemCheckBlock->getTerminator()->setDebugLoc(
2136         Pred->getTerminator()->getDebugLoc());
2137 
2138     // Mark the check as used, to prevent it from being removed during cleanup.
2139     MemRuntimeCheckCond = nullptr;
2140     return MemCheckBlock;
2141   }
2142 };
2143 
2144 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2145 // vectorization. The loop needs to be annotated with #pragma omp simd
2146 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
2147 // the vector length information is not provided, vectorization is not
2148 // considered explicit. Interleave hints are not allowed either. These
2149 // limitations will be relaxed in the future.
2150 // Please note that we are currently forced to abuse the pragma 'clang
2151 // loop vectorize' semantics. This pragma provides *auto-vectorization hints*
2152 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2153 // provides *explicit vectorization hints* (LV can bypass legal checks and
2154 // assume that vectorization is legal). However, both hints are implemented
2155 // using the same metadata (llvm.loop.vectorize, processed by
2156 // LoopVectorizeHints). This will be fixed in the future when the native IR
2157 // representation for pragma 'omp simd' is introduced.
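//
// For example (illustrative), an outer loop annotated as
//   #pragma omp simd simdlen(4)
//   for (int I = 0; I < N; ++I)
//     for (int J = 0; J < M; ++J)
//       A[I][J] += B[I][J];
// is considered for explicit outer-loop vectorization here, while the same
// loop nest without a vector-length clause is not.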
2158 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2159                                    OptimizationRemarkEmitter *ORE) {
2160   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2161   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2162 
2163   // Only outer loops with an explicit vectorization hint are supported.
2164   // Unannotated outer loops are ignored.
2165   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2166     return false;
2167 
2168   Function *Fn = OuterLp->getHeader()->getParent();
2169   if (!Hints.allowVectorization(Fn, OuterLp,
2170                                 true /*VectorizeOnlyWhenForced*/)) {
2171     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2172     return false;
2173   }
2174 
2175   if (Hints.getInterleave() > 1) {
2176     // TODO: Interleave support is future work.
2177     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2178                          "outer loops.\n");
2179     Hints.emitRemarkWithHints();
2180     return false;
2181   }
2182 
2183   return true;
2184 }
2185 
2186 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2187                                   OptimizationRemarkEmitter *ORE,
2188                                   SmallVectorImpl<Loop *> &V) {
2189   // Collect inner loops and outer loops without irreducible control flow. For
2190   // now, only collect outer loops that have explicit vectorization hints. If we
2191   // are stress testing the VPlan H-CFG construction, we collect the outermost
2192   // loop of every loop nest.
2193   if (L.isInnermost() || VPlanBuildStressTest ||
2194       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2195     LoopBlocksRPO RPOT(&L);
2196     RPOT.perform(LI);
2197     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2198       V.push_back(&L);
2199       // TODO: Collect inner loops inside marked outer loops in case
2200       // vectorization fails for the outer loop. Do not invoke
2201       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2202       // already known to be reducible. We can use an inherited attribute for
2203       // that.
2204       return;
2205     }
2206   }
2207   for (Loop *InnerL : L)
2208     collectSupportedLoops(*InnerL, LI, ORE, V);
2209 }
2210 
2211 namespace {
2212 
2213 /// The LoopVectorize Pass.
2214 struct LoopVectorize : public FunctionPass {
2215   /// Pass identification, replacement for typeid
2216   static char ID;
2217 
2218   LoopVectorizePass Impl;
2219 
2220   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2221                          bool VectorizeOnlyWhenForced = false)
2222       : FunctionPass(ID),
2223         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2224     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2225   }
2226 
2227   bool runOnFunction(Function &F) override {
2228     if (skipFunction(F))
2229       return false;
2230 
2231     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2232     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2233     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2234     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2235     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2236     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2237     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2238     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2239     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2240     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2241     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2242     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2243     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2244 
2245     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2246         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2247 
2248     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2249                         GetLAA, *ORE, PSI).MadeAnyChange;
2250   }
2251 
2252   void getAnalysisUsage(AnalysisUsage &AU) const override {
2253     AU.addRequired<AssumptionCacheTracker>();
2254     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2255     AU.addRequired<DominatorTreeWrapperPass>();
2256     AU.addRequired<LoopInfoWrapperPass>();
2257     AU.addRequired<ScalarEvolutionWrapperPass>();
2258     AU.addRequired<TargetTransformInfoWrapperPass>();
2259     AU.addRequired<AAResultsWrapperPass>();
2260     AU.addRequired<LoopAccessLegacyAnalysis>();
2261     AU.addRequired<DemandedBitsWrapperPass>();
2262     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2263     AU.addRequired<InjectTLIMappingsLegacy>();
2264 
2265     // We currently do not preserve loopinfo/dominator analyses with outer loop
2266     // vectorization. Until this is addressed, mark these analyses as preserved
2267     // only for non-VPlan-native path.
2268     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2269     if (!EnableVPlanNativePath) {
2270       AU.addPreserved<LoopInfoWrapperPass>();
2271       AU.addPreserved<DominatorTreeWrapperPass>();
2272     }
2273 
2274     AU.addPreserved<BasicAAWrapperPass>();
2275     AU.addPreserved<GlobalsAAWrapperPass>();
2276     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2277   }
2278 };
2279 
2280 } // end anonymous namespace
2281 
2282 //===----------------------------------------------------------------------===//
2283 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2284 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2285 //===----------------------------------------------------------------------===//
2286 
2287 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2288   // We need to place the broadcast of invariant variables outside the loop,
2289   // but only if it's proven safe to do so. Otherwise, the broadcast will be
2290   // placed inside the vector loop body.
2291   Instruction *Instr = dyn_cast<Instruction>(V);
2292   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2293                      (!Instr ||
2294                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2295   // Place the code for broadcasting invariant variables in the new preheader.
2296   IRBuilder<>::InsertPointGuard Guard(Builder);
2297   if (SafeToHoist)
2298     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2299 
2300   // Broadcast the scalar into all locations in the vector.
2301   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2302 
2303   return Shuf;
2304 }
2305 
2306 /// This function adds
2307 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2308 /// to each vector element of Val. The sequence starts at StartIdx.
2309 /// \p BinOp is only relevant for FP induction variables.
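/// For example (with an integer Val), VF = 4, StartIdx = 2 and Step = 3 add
/// the vector <6, 9, 12, 15> to Val.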
2310 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2311                             Instruction::BinaryOps BinOp, ElementCount VF,
2312                             IRBuilderBase &Builder) {
2313   assert(VF.isVector() && "only vector VFs are supported");
2314 
2315   // Create and check the types.
2316   auto *ValVTy = cast<VectorType>(Val->getType());
2317   ElementCount VLen = ValVTy->getElementCount();
2318 
2319   Type *STy = Val->getType()->getScalarType();
2320   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2321          "Induction Step must be an integer or FP");
2322   assert(Step->getType() == STy && "Step has wrong type");
2323 
2324   SmallVector<Constant *, 8> Indices;
2325 
2326   // Create a vector of consecutive numbers from zero to VF.
2327   VectorType *InitVecValVTy = ValVTy;
2328   if (STy->isFloatingPointTy()) {
2329     Type *InitVecValSTy =
2330         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2331     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2332   }
2333   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2334 
2335   // Splat the StartIdx
2336   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2337 
2338   if (STy->isIntegerTy()) {
2339     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2340     Step = Builder.CreateVectorSplat(VLen, Step);
2341     assert(Step->getType() == Val->getType() && "Invalid step vec");
2342     // FIXME: The newly created binary instructions should contain nsw/nuw
2343     // flags, which can be found from the original scalar operations.
2344     Step = Builder.CreateMul(InitVec, Step);
2345     return Builder.CreateAdd(Val, Step, "induction");
2346   }
2347 
2348   // Floating point induction.
2349   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2350          "Binary Opcode should be specified for FP induction");
2351   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2352   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2353 
2354   Step = Builder.CreateVectorSplat(VLen, Step);
2355   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2356   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2357 }
2358 
2359 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2360 /// variable on which to base the steps, \p Step is the size of the step.
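/// For example (a sketch with a fixed VF), with State.UF = 2 and State.VF = 4
/// the value produced for part P, lane L is ScalarIV + (P * 4 + L) * Step,
/// i.e. part 0 yields ScalarIV + {0,1,2,3} * Step and part 1 yields
/// ScalarIV + {4,5,6,7} * Step.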
2361 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2362                              const InductionDescriptor &ID, VPValue *Def,
2363                              VPTransformState &State) {
2364   IRBuilderBase &Builder = State.Builder;
2365   // We shouldn't have to build scalar steps if we aren't vectorizing.
2366   assert(State.VF.isVector() && "VF should be greater than one");
2367   // Get the value type and ensure it and the step have the same integer type.
2368   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2369   assert(ScalarIVTy == Step->getType() &&
2370          "ScalarIV and Step should have the same type");
2371 
2372   // We build scalar steps for both integer and floating-point induction
2373   // variables. Here, we determine the kind of arithmetic we will perform.
2374   Instruction::BinaryOps AddOp;
2375   Instruction::BinaryOps MulOp;
2376   if (ScalarIVTy->isIntegerTy()) {
2377     AddOp = Instruction::Add;
2378     MulOp = Instruction::Mul;
2379   } else {
2380     AddOp = ID.getInductionOpcode();
2381     MulOp = Instruction::FMul;
2382   }
2383 
2384   // Determine the number of scalars we need to generate for each unroll
2385   // iteration.
2386   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2387   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2388   // Compute the scalar steps and save the results in State.
2389   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2390                                      ScalarIVTy->getScalarSizeInBits());
2391   Type *VecIVTy = nullptr;
2392   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2393   if (!FirstLaneOnly && State.VF.isScalable()) {
2394     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2395     UnitStepVec =
2396         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2397     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2398     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2399   }
2400 
2401   for (unsigned Part = 0; Part < State.UF; ++Part) {
2402     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2403 
2404     if (!FirstLaneOnly && State.VF.isScalable()) {
2405       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2406       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2407       if (ScalarIVTy->isFloatingPointTy())
2408         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2409       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2410       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2411       State.set(Def, Add, Part);
      // It's also useful to record the per-lane values for the known minimum
      // number of elements, so we do that below. This improves code quality
      // when, for example, extracting the first element.
2415     }
2416 
2417     if (ScalarIVTy->isFloatingPointTy())
2418       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2419 
2420     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2421       Value *StartIdx = Builder.CreateBinOp(
2422           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2423       // The step returned by `createStepForVF` is a runtime-evaluated value
2424       // when VF is scalable. Otherwise, it should be folded into a Constant.
2425       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2426              "Expected StartIdx to be folded to a constant when VF is not "
2427              "scalable");
2428       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2429       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2430       State.set(Def, Add, VPIteration(Part, Lane));
2431     }
2432   }
2433 }
2434 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
2437 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2438                               Instruction *InsertBefore,
2439                               Loop *OrigLoop = nullptr) {
2440   const DataLayout &DL = SE.getDataLayout();
2441   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2442          "Induction step should be loop invariant");
2443   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2444     return E->getValue();
2445 
2446   SCEVExpander Exp(SE, DL, "induction");
2447   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2448 }
2449 
2450 /// Compute the transformed value of Index at offset StartValue using step
2451 /// StepValue.
2452 /// For integer induction, returns StartValue + Index * StepValue.
2453 /// For pointer induction, returns StartValue[Index * StepValue].
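/// For FP induction, returns StartValue +/- Index * StepValue, using the
/// original FAdd/FSub opcode of the induction.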
2454 /// FIXME: The newly created binary instructions should contain nsw/nuw
2455 /// flags, which can be found from the original scalar operations.
2456 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2457                                    Value *StartValue, Value *Step,
2458                                    const InductionDescriptor &ID) {
2459   assert(Index->getType()->getScalarType() == Step->getType() &&
2460          "Index scalar type does not match StepValue type");
2461 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
2468   auto CreateAdd = [&B](Value *X, Value *Y) {
2469     assert(X->getType() == Y->getType() && "Types don't match!");
2470     if (auto *CX = dyn_cast<ConstantInt>(X))
2471       if (CX->isZero())
2472         return Y;
2473     if (auto *CY = dyn_cast<ConstantInt>(Y))
2474       if (CY->isZero())
2475         return X;
2476     return B.CreateAdd(X, Y);
2477   };
2478 
2479   // We allow X to be a vector type, in which case Y will potentially be
2480   // splatted into a vector with the same element count.
2481   auto CreateMul = [&B](Value *X, Value *Y) {
2482     assert(X->getType()->getScalarType() == Y->getType() &&
2483            "Types don't match!");
2484     if (auto *CX = dyn_cast<ConstantInt>(X))
2485       if (CX->isOne())
2486         return Y;
2487     if (auto *CY = dyn_cast<ConstantInt>(Y))
2488       if (CY->isOne())
2489         return X;
2490     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2491     if (XVTy && !isa<VectorType>(Y->getType()))
2492       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2493     return B.CreateMul(X, Y);
2494   };
2495 
2496   switch (ID.getKind()) {
2497   case InductionDescriptor::IK_IntInduction: {
2498     assert(!isa<VectorType>(Index->getType()) &&
2499            "Vector indices not supported for integer inductions yet");
2500     assert(Index->getType() == StartValue->getType() &&
2501            "Index type does not match StartValue type");
2502     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2503       return B.CreateSub(StartValue, Index);
2504     auto *Offset = CreateMul(Index, Step);
2505     return CreateAdd(StartValue, Offset);
2506   }
2507   case InductionDescriptor::IK_PtrInduction: {
2508     assert(isa<Constant>(Step) &&
2509            "Expected constant step for pointer induction");
2510     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2511   }
2512   case InductionDescriptor::IK_FpInduction: {
2513     assert(!isa<VectorType>(Index->getType()) &&
2514            "Vector indices not supported for FP inductions yet");
2515     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2516     auto InductionBinOp = ID.getInductionBinOp();
2517     assert(InductionBinOp &&
2518            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2519             InductionBinOp->getOpcode() == Instruction::FSub) &&
2520            "Original bin op should be defined for FP induction");
2521 
2522     Value *MulExp = B.CreateFMul(Step, Index);
2523     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2524                          "induction");
2525   }
2526   case InductionDescriptor::IK_NoInduction:
2527     return nullptr;
2528   }
2529   llvm_unreachable("invalid enum");
2530 }
2531 
2532 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2533                                                     const VPIteration &Instance,
2534                                                     VPTransformState &State) {
2535   Value *ScalarInst = State.get(Def, Instance);
2536   Value *VectorValue = State.get(Def, Instance.Part);
2537   VectorValue = Builder.CreateInsertElement(
2538       VectorValue, ScalarInst,
2539       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2540   State.set(Def, VectorValue, Instance.Part);
2541 }
2542 
2543 // Return whether we allow using masked interleave-groups (for dealing with
2544 // strided loads/stores that reside in predicated blocks, or for dealing
2545 // with gaps).
2546 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2547   // If an override option has been passed in for interleaved accesses, use it.
2548   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2549     return EnableMaskedInterleavedMemAccesses;
2550 
2551   return TTI.enableMaskedInterleavedAccessVectorization();
2552 }
2553 
2554 // Try to vectorize the interleave group that \p Instr belongs to.
2555 //
2556 // E.g. Translate following interleaved load group (factor = 3):
2557 //   for (i = 0; i < N; i+=3) {
2558 //     R = Pic[i];             // Member of index 0
2559 //     G = Pic[i+1];           // Member of index 1
2560 //     B = Pic[i+2];           // Member of index 2
2561 //     ... // do something to R, G, B
2562 //   }
2563 // To:
2564 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2565 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2566 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2567 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2568 //
2569 // Or translate following interleaved store group (factor = 3):
2570 //   for (i = 0; i < N; i+=3) {
2571 //     ... do something to R, G, B
2572 //     Pic[i]   = R;           // Member of index 0
2573 //     Pic[i+1] = G;           // Member of index 1
2574 //     Pic[i+2] = B;           // Member of index 2
2575 //   }
2576 // To:
2577 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2578 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2579 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2580 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2581 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2582 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2583     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2584     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2585     VPValue *BlockInMask) {
2586   Instruction *Instr = Group->getInsertPos();
2587   const DataLayout &DL = Instr->getModule()->getDataLayout();
2588 
2589   // Prepare for the vector type of the interleaved load/store.
2590   Type *ScalarTy = getLoadStoreType(Instr);
2591   unsigned InterleaveFactor = Group->getFactor();
2592   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2593   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2594 
2595   // Prepare for the new pointers.
2596   SmallVector<Value *, 2> AddrParts;
2597   unsigned Index = Group->getIndex(Instr);
2598 
2599   // TODO: extend the masked interleaved-group support to reversed access.
2600   assert((!BlockInMask || !Group->isReverse()) &&
2601          "Reversed masked interleave-group not supported.");
2602 
2603   // If the group is reverse, adjust the index to refer to the last vector lane
2604   // instead of the first. We adjust the index from the first vector lane,
2605   // rather than directly getting the pointer for lane VF - 1, because the
2606   // pointer operand of the interleaved access is supposed to be uniform. For
2607   // uniform instructions, we're only required to generate a value for the
2608   // first vector lane in each unroll iteration.
2609   if (Group->isReverse())
2610     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2611 
2612   for (unsigned Part = 0; Part < UF; Part++) {
2613     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2614     setDebugLocFromInst(AddrPart);
2615 
    // Note that the current instruction could be at any member index, so we
    // need to adjust the address to point to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2627 
2628     bool InBounds = false;
2629     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2630       InBounds = gep->isInBounds();
2631     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2632     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2633 
2634     // Cast to the vector pointer type.
2635     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2636     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2637     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2638   }
2639 
2640   setDebugLocFromInst(Instr);
2641   Value *PoisonVec = PoisonValue::get(VecTy);
2642 
2643   Value *MaskForGaps = nullptr;
2644   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2645     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2646     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2647   }
2648 
2649   // Vectorize the interleaved load group.
2650   if (isa<LoadInst>(Instr)) {
2651     // For each unroll part, create a wide load for the group.
2652     SmallVector<Value *, 2> NewLoads;
2653     for (unsigned Part = 0; Part < UF; Part++) {
2654       Instruction *NewLoad;
2655       if (BlockInMask || MaskForGaps) {
2656         assert(useMaskedInterleavedAccesses(*TTI) &&
2657                "masked interleaved groups are not allowed.");
2658         Value *GroupMask = MaskForGaps;
2659         if (BlockInMask) {
2660           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2661           Value *ShuffledMask = Builder.CreateShuffleVector(
2662               BlockInMaskPart,
2663               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2664               "interleaved.mask");
2665           GroupMask = MaskForGaps
2666                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2667                                                 MaskForGaps)
2668                           : ShuffledMask;
2669         }
2670         NewLoad =
2671             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2672                                      GroupMask, PoisonVec, "wide.masked.vec");
2673       }
2674       else
2675         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2676                                             Group->getAlign(), "wide.vec");
2677       Group->addMetadata(NewLoad);
2678       NewLoads.push_back(NewLoad);
2679     }
2680 
2681     // For each member in the group, shuffle out the appropriate data from the
2682     // wide loads.
2683     unsigned J = 0;
2684     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2685       Instruction *Member = Group->getMember(I);
2686 
2687       // Skip the gaps in the group.
2688       if (!Member)
2689         continue;
2690 
2691       auto StrideMask =
2692           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2693       for (unsigned Part = 0; Part < UF; Part++) {
2694         Value *StridedVec = Builder.CreateShuffleVector(
2695             NewLoads[Part], StrideMask, "strided.vec");
2696 
        // If this member has a different type, cast the result to the
        // member's type.
2698         if (Member->getType() != ScalarTy) {
2699           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2700           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2701           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2702         }
2703 
2704         if (Group->isReverse())
2705           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2706 
2707         State.set(VPDefs[J], StridedVec, Part);
2708       }
2709       ++J;
2710     }
2711     return;
2712   }
2713 
  // The sub-vector type for the current instruction.
2715   auto *SubVT = VectorType::get(ScalarTy, VF);
2716 
2717   // Vectorize the interleaved store group.
2718   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2719   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2720          "masked interleaved groups are not allowed.");
2721   assert((!MaskForGaps || !VF.isScalable()) &&
2722          "masking gaps for scalable vectors is not yet supported.");
2723   for (unsigned Part = 0; Part < UF; Part++) {
2724     // Collect the stored vector from each member.
2725     SmallVector<Value *, 4> StoredVecs;
2726     for (unsigned i = 0; i < InterleaveFactor; i++) {
2727       assert((Group->getMember(i) || MaskForGaps) &&
2728              "Fail to get a member from an interleaved store group");
2729       Instruction *Member = Group->getMember(i);
2730 
2731       // Skip the gaps in the group.
2732       if (!Member) {
2733         Value *Undef = PoisonValue::get(SubVT);
2734         StoredVecs.push_back(Undef);
2735         continue;
2736       }
2737 
2738       Value *StoredVec = State.get(StoredValues[i], Part);
2739 
2740       if (Group->isReverse())
2741         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2742 
      // If this member has a different type, cast it to a unified type.
2745       if (StoredVec->getType() != SubVT)
2746         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2747 
2748       StoredVecs.push_back(StoredVec);
2749     }
2750 
2751     // Concatenate all vectors into a wide vector.
2752     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2753 
2754     // Interleave the elements in the wide vector.
2755     Value *IVec = Builder.CreateShuffleVector(
2756         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2757         "interleaved.vec");
2758 
2759     Instruction *NewStoreInstr;
2760     if (BlockInMask || MaskForGaps) {
2761       Value *GroupMask = MaskForGaps;
2762       if (BlockInMask) {
2763         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2764         Value *ShuffledMask = Builder.CreateShuffleVector(
2765             BlockInMaskPart,
2766             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2767             "interleaved.mask");
2768         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2769                                                       ShuffledMask, MaskForGaps)
2770                                 : ShuffledMask;
2771       }
2772       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2773                                                 Group->getAlign(), GroupMask);
2774     } else
2775       NewStoreInstr =
2776           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2777 
2778     Group->addMetadata(NewStoreInstr);
2779   }
2780 }
2781 
2782 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2783                                                VPReplicateRecipe *RepRecipe,
2784                                                const VPIteration &Instance,
2785                                                bool IfPredicateInstr,
2786                                                VPTransformState &State) {
2787   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2788 
2789   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2790   // the first lane and part.
2791   if (isa<NoAliasScopeDeclInst>(Instr))
2792     if (!Instance.isFirstIteration())
2793       return;
2794 
2795   setDebugLocFromInst(Instr);
2796 
  // Does this instruction return a value?
2798   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2799 
2800   Instruction *Cloned = Instr->clone();
2801   if (!IsVoidRetTy)
2802     Cloned->setName(Instr->getName() + ".cloned");
2803 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2810   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2811     Cloned->dropPoisonGeneratingFlags();
2812 
2813   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2814                                Builder.GetInsertPoint());
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2817   for (auto &I : enumerate(RepRecipe->operands())) {
2818     auto InputInstance = Instance;
2819     VPValue *Operand = I.value();
2820     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2821     if (OperandR && OperandR->isUniform())
2822       InputInstance.Lane = VPLane::getFirstLane();
2823     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2824   }
2825   addNewMetadata(Cloned, Instr);
2826 
2827   // Place the cloned scalar in the new loop.
2828   Builder.Insert(Cloned);
2829 
2830   State.set(RepRecipe, Cloned, Instance);
2831 
  // If we just cloned a new assumption, add it to the assumption cache.
2833   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2834     AC->registerAssumption(II);
2835 
2836   // End if-block.
2837   if (IfPredicateInstr)
2838     PredicatedInstructions.push_back(Cloned);
2839 }
2840 
2841 void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
2842   BasicBlock *Header = L->getHeader();
2843   assert(!L->getLoopLatch() && "loop should not have a latch at this point");
2844 
2845   IRBuilder<> B(Header->getTerminator());
2846   Instruction *OldInst =
2847       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
2848   setDebugLocFromInst(OldInst, &B);
2849 
2850   // Connect the header to the exit and header blocks and replace the old
2851   // terminator.
2852   B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
2853 
2854   // Now we have two terminators. Remove the old one from the block.
2855   Header->getTerminator()->eraseFromParent();
2856 }
2857 
2858 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2859   if (TripCount)
2860     return TripCount;
2861 
2862   assert(L && "Create Trip Count for null loop.");
2863   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2864   // Find the loop boundaries.
2865   ScalarEvolution *SE = PSE.getSE();
2866   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2867   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2868          "Invalid loop count");
2869 
2870   Type *IdxTy = Legal->getWidestInductionType();
2871   assert(IdxTy && "No type for induction");
2872 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the compare.
  // The only way we can get a backedge-taken count in that case is if the
  // induction variable was signed and as such will not overflow, so the
  // truncation is legal.
2878   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2879       IdxTy->getPrimitiveSizeInBits())
2880     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2881   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2882 
2883   // Get the total trip count from the count by adding 1.
2884   const SCEV *ExitCount = SE->getAddExpr(
2885       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2886 
2887   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2888 
2889   // Expand the trip count and place the new instructions in the preheader.
2890   // Notice that the pre-header does not change, only the loop body.
2891   SCEVExpander Exp(*SE, DL, "induction");
2892 
2893   // Count holds the overall loop count (N).
2894   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2895                                 L->getLoopPreheader()->getTerminator());
2896 
2897   if (TripCount->getType()->isPointerTy())
2898     TripCount =
2899         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2900                                     L->getLoopPreheader()->getTerminator());
2901 
2902   return TripCount;
2903 }
2904 
2905 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2906   if (VectorTripCount)
2907     return VectorTripCount;
2908 
2909   Value *TC = getOrCreateTripCount(L);
2910   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2911 
2912   Type *Ty = TC->getType();
2913   // This is where we can make the step a runtime constant.
2914   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2915 
2916   // If the tail is to be folded by masking, round the number of iterations N
2917   // up to a multiple of Step instead of rounding down. This is done by first
2918   // adding Step-1 and then rounding down. Note that it's ok if this addition
2919   // overflows: the vector induction variable will eventually wrap to zero given
2920   // that it starts at zero and its Step is a power of two; the loop will then
2921   // exit, with the last early-exit vector comparison also producing all-true.
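  // E.g. with a trip count of 10 and VF * UF = 4, N is first rounded up to
  // 13, so the vector trip count computed below becomes 12 and the last two
  // lanes are masked off.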
2922   if (Cost->foldTailByMasking()) {
2923     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2924            "VF*UF must be a power of 2 when folding tail by masking");
2925     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2926     TC = Builder.CreateAdd(
2927         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2928   }
2929 
2930   // Now we need to generate the expression for the part of the loop that the
2931   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2932   // iterations are not required for correctness, or N - Step, otherwise. Step
2933   // is equal to the vectorization factor (number of SIMD elements) times the
2934   // unroll factor (number of SIMD instructions).
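  // E.g. with a trip count of 10 and VF * UF = 4 (and no tail folding),
  // n.mod.vf is 2 and the vector trip count is 8, leaving 2 scalar
  // iterations for the epilogue loop.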
2935   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2936 
2937   // There are cases where we *must* run at least one iteration in the remainder
2938   // loop.  See the cost model for when this can happen.  If the step evenly
2939   // divides the trip count, we set the remainder to be equal to the step. If
2940   // the step does not evenly divide the trip count, no adjustment is necessary
2941   // since there will already be scalar iterations. Note that the minimum
2942   // iterations check ensures that N >= Step.
2943   if (Cost->requiresScalarEpilogue(VF)) {
2944     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2945     R = Builder.CreateSelect(IsZero, Step, R);
2946   }
2947 
2948   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2949 
2950   return VectorTripCount;
2951 }
2952 
2953 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2954                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2956   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2957   unsigned VF = DstFVTy->getNumElements();
2958   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2959   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2960   Type *SrcElemTy = SrcVecTy->getElementType();
2961   Type *DstElemTy = DstFVTy->getElementType();
2962   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2963          "Vector elements must have same size");
2964 
2965   // Do a direct cast if element types are castable.
2966   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2967     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2968   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
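  // E.g. (assuming 64-bit pointers) casting <2 x double> to <2 x i8*> becomes
  // <2 x double> -> <2 x i64> -> <2 x i8*>.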
2973   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2974          "Only one type should be a pointer type");
2975   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2976          "Only one type should be a floating point type");
2977   Type *IntTy =
2978       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2979   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2980   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2981   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2982 }
2983 
2984 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2985                                                          BasicBlock *Bypass) {
2986   Value *Count = getOrCreateTripCount(L);
2987   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
2989   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2990   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2991 
2992   // Generate code to check if the loop's trip count is less than VF * UF, or
2993   // equal to it in case a scalar epilogue is required; this implies that the
2994   // vector trip count is zero. This check also covers the case where adding one
2995   // to the backedge-taken count overflowed leading to an incorrect trip count
2996   // of zero. In this case we will also jump to the scalar loop.
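  // E.g. with VF * UF = 8, no required scalar epilogue and no tail folding,
  // the check emitted below is equivalent to: icmp ult %trip.count, 8.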
2997   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2998                                             : ICmpInst::ICMP_ULT;
2999 
3000   // If tail is to be folded, vector loop takes care of all iterations.
3001   Value *CheckMinIters = Builder.getFalse();
3002   if (!Cost->foldTailByMasking()) {
3003     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3004     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3005   }
3006   // Create new preheader for vector loop.
3007   LoopVectorPreHeader =
3008       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3009                  "vector.ph");
3010 
3011   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3012                                DT->getNode(Bypass)->getIDom()) &&
3013          "TC check is expected to dominate Bypass");
3014 
3015   // Update dominator for Bypass & LoopExit (if needed).
3016   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3017   if (!Cost->requiresScalarEpilogue(VF))
3018     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3020     // dominator of the exit blocks.
3021     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3022 
3023   ReplaceInstWithInst(
3024       TCCheckBlock->getTerminator(),
3025       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3026   LoopBypassBlocks.push_back(TCCheckBlock);
3027 }
3028 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3031   BasicBlock *const SCEVCheckBlock =
3032       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3033   if (!SCEVCheckBlock)
3034     return nullptr;
3035 
3036   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3037            (OptForSizeBasedOnProfile &&
3038             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3039          "Cannot SCEV check stride or overflow when optimizing for size");
3040 
3041 
  // Update the dominator only if this is the first RT check.
3043   if (LoopBypassBlocks.empty()) {
3044     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3045     if (!Cost->requiresScalarEpilogue(VF))
3046       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3048       // dominator of the exit blocks.
3049       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3050   }
3051 
3052   LoopBypassBlocks.push_back(SCEVCheckBlock);
3053   AddedSafetyChecks = true;
3054   return SCEVCheckBlock;
3055 }
3056 
3057 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3058                                                       BasicBlock *Bypass) {
3059   // VPlan-native path does not do any analysis for runtime checks currently.
3060   if (EnableVPlanNativePath)
3061     return nullptr;
3062 
3063   BasicBlock *const MemCheckBlock =
3064       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3065 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3069   if (!MemCheckBlock)
3070     return nullptr;
3071 
3072   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3073     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3074            "Cannot emit memory checks when optimizing for size, unless forced "
3075            "to vectorize.");
3076     ORE->emit([&]() {
3077       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3078                                         L->getStartLoc(), L->getHeader())
3079              << "Code-size may be reduced by not forcing "
3080                 "vectorization, or by source-code modifications "
3081                 "eliminating the need for runtime checks "
3082                 "(e.g., adding 'restrict').";
3083     });
3084   }
3085 
3086   LoopBypassBlocks.push_back(MemCheckBlock);
3087 
3088   AddedSafetyChecks = true;
3089 
3090   // We currently don't use LoopVersioning for the actual loop cloning but we
3091   // still use it to add the noalias metadata.
3092   LVer = std::make_unique<LoopVersioning>(
3093       *Legal->getLAI(),
3094       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3095       DT, PSE.getSE());
3096   LVer->prepareNoAliasMetadata();
3097   return MemCheckBlock;
3098 }
3099 
3100 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3101   LoopScalarBody = OrigLoop->getHeader();
3102   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3103   assert(LoopVectorPreHeader && "Invalid loop structure");
3104   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3105   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3106          "multiple exit loop without required epilogue?");
3107 
3108   LoopMiddleBlock =
3109       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3110                  LI, nullptr, Twine(Prefix) + "middle.block");
3111   LoopScalarPreHeader =
3112       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3113                  nullptr, Twine(Prefix) + "scalar.ph");
3114 
3115   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3116 
3117   // Set up the middle block terminator.  Two cases:
3118   // 1) If we know that we must execute the scalar epilogue, emit an
3119   //    unconditional branch.
3120   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3122   //    branch from the middle block to the loop scalar preheader, and the
3123   //    exit block.  completeLoopSkeleton will update the condition to use an
3124   //    iteration check, if required to decide whether to execute the remainder.
3125   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3126     BranchInst::Create(LoopScalarPreHeader) :
3127     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3128                        Builder.getTrue());
3129   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3130   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3131 
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3135   LoopVectorBody =
3136       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3137                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3138 
3139   // Update dominator for loop exit.
3140   if (!Cost->requiresScalarEpilogue(VF))
3141     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3143     // dominator of the exit blocks.
3144     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3145 
3146   // Create and register the new vector loop.
3147   Loop *Lp = LI->AllocateLoop();
3148   Loop *ParentLoop = OrigLoop->getParentLoop();
3149 
3150   // Insert the new loop into the loop nest and register the new basic blocks
3151   // before calling any utilities such as SCEV that require valid LoopInfo.
3152   if (ParentLoop) {
3153     ParentLoop->addChildLoop(Lp);
3154   } else {
3155     LI->addTopLevelLoop(Lp);
3156   }
3157   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3158   return Lp;
3159 }
3160 
3161 void InnerLoopVectorizer::createInductionResumeValues(
3162     Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
3163   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3164           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3165          "Inconsistent information about additional bypass.");
3166 
3167   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3168   assert(VectorTripCount && L && "Expected valid arguments");
3169   // We are going to resume the execution of the scalar loop.
3170   // Go over all of the induction variables that we found and fix the
3171   // PHIs that are left in the scalar version of the loop.
3172   // The starting values of PHI nodes depend on the counter of the last
3173   // iteration in the vectorized loop.
3174   // If we come from a bypass edge then we need to start from the original
3175   // start value.
3176   Instruction *OldInduction = Legal->getPrimaryInduction();
3177   for (auto &InductionEntry : Legal->getInductionVars()) {
3178     PHINode *OrigPhi = InductionEntry.first;
3179     InductionDescriptor II = InductionEntry.second;
3180 
    // Create phi nodes to merge from the backedge-taken check block.
3182     PHINode *BCResumeVal =
3183         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3184                         LoopScalarPreHeader->getTerminator());
3185     // Copy original phi DL over to the new one.
3186     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3187     Value *&EndValue = IVEndValues[OrigPhi];
3188     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3189     if (OrigPhi == OldInduction) {
3190       // We know what the end value is.
3191       EndValue = VectorTripCount;
3192     } else {
3193       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3194 
3195       // Fast-math-flags propagate from the original induction instruction.
3196       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3197         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3198 
3199       Type *StepType = II.getStep()->getType();
3200       Instruction::CastOps CastOp =
3201           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3202       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3203       Value *Step =
3204           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3205       EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
3206       EndValue->setName("ind.end");
3207 
3208       // Compute the end value for the additional bypass (if applicable).
3209       if (AdditionalBypass.first) {
3210         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3211         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3212                                          StepType, true);
3213         Value *Step =
3214             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3215         CRD =
3216             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3217         EndValueFromAdditionalBypass =
3218             emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
3219         EndValueFromAdditionalBypass->setName("ind.end");
3220       }
3221     }
3222     // The new PHI merges the original incoming value, in case of a bypass,
3223     // or the value at the end of the vectorized loop.
3224     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3225 
3226     // Fix the scalar body counter (PHI node).
3227     // The old induction's phi node in the scalar body needs the truncated
3228     // value.
3229     for (BasicBlock *BB : LoopBypassBlocks)
3230       BCResumeVal->addIncoming(II.getStartValue(), BB);
3231 
3232     if (AdditionalBypass.first)
3233       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3234                                             EndValueFromAdditionalBypass);
3235 
3236     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3237   }
3238 }
3239 
3240 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3241                                                       MDNode *OrigLoopID) {
3242   assert(L && "Expected valid loop.");
3243 
3244   // The trip counts should be cached by now.
3245   Value *Count = getOrCreateTripCount(L);
3246   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3247 
3248   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3249 
3250   // Add a check in the middle block to see if we have completed
3251   // all of the iterations in the first vector loop.  Three cases:
3252   // 1) If we require a scalar epilogue, there is no conditional branch as
3253   //    we unconditionally branch to the scalar preheader.  Do nothing.
3254   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3255   //    Thus if tail is to be folded, we know we don't need to run the
3256   //    remainder and we can use the previous value for the condition (true).
3257   // 3) Otherwise, construct a runtime check.
3258   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3259     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3260                                         Count, VectorTripCount, "cmp.n",
3261                                         LoopMiddleBlock->getTerminator());
3262 
3263     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3264     // of the corresponding compare because they may have ended up with
3265     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g. if the compare has a line number inside the loop.
3267     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3268     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3269   }
3270 
3271   // Get ready to start creating new instructions into the vectorized body.
3272   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3273          "Inconsistent vector loop preheader");
3274 
3275 #ifdef EXPENSIVE_CHECKS
3276   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3277   LI->verify(*DT);
3278 #endif
3279 
3280   return LoopVectorPreHeader;
3281 }
3282 
3283 std::pair<BasicBlock *, Value *>
3284 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3285   /*
3286    In this function we generate a new loop. The new loop will contain
3287    the vectorized instructions while the old loop will continue to run the
3288    scalar remainder.
3289 
3290        [ ] <-- loop iteration number check.
3291     /   |
3292    /    v
3293   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3294   |  /  |
3295   | /   v
3296   ||   [ ]     <-- vector pre header.
3297   |/    |
3298   |     v
3299   |    [  ] \
3300   |    [  ]_|   <-- vector loop.
3301   |     |
3302   |     v
3303   \   -[ ]   <--- middle-block.
3304    \/   |
3305    /\   v
3306    | ->[ ]     <--- new preheader.
3307    |    |
3308  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3309    |   [ ] \
3310    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3311     \   |
3312      \  v
3313       >[ ]     <-- exit block(s).
3314    ...
3315    */
3316 
3317   // Get the metadata of the original loop before it gets modified.
3318   MDNode *OrigLoopID = OrigLoop->getLoopID();
3319 
3320   // Workaround!  Compute the trip count of the original loop and cache it
3321   // before we start modifying the CFG.  This code has a systemic problem
3322   // wherein it tries to run analysis over partially constructed IR; this is
3323   // wrong, and not simply for SCEV.  The trip count of the original loop
3324   // simply happens to be prone to hitting this in practice.  In theory, we
3325   // can hit the same issue for any SCEV, or ValueTracking query done during
3326   // mutation.  See PR49900.
3327   getOrCreateTripCount(OrigLoop);
3328 
3329   // Create an empty vector loop, and prepare basic blocks for the runtime
3330   // checks.
3331   Loop *Lp = createVectorLoopSkeleton("");
3332 
3333   // Now, compare the new count to zero. If it is zero skip the vector loop and
3334   // jump to the scalar loop. This check also covers the case where the
3335   // backedge-taken count is uint##_max: adding one to it will overflow leading
3336   // to an incorrect trip count of zero. In this (rare) case we will also jump
3337   // to the scalar loop.
3338   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3339 
3340   // Generate the code to check any assumptions that we've made for SCEV
3341   // expressions.
3342   emitSCEVChecks(LoopScalarPreHeader);
3343 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3347   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3348 
3349   createHeaderBranch(Lp);
3350 
3351   // Emit phis for the new starting index of the scalar loop.
3352   createInductionResumeValues(Lp);
3353 
3354   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
3355 }
3356 
3357 // Fix up external users of the induction variable. At this point, we are
3358 // in LCSSA form, with all external PHIs that use the IV having one input value,
3359 // coming from the remainder loop. We need those PHIs to also have a correct
3360 // value for the IV when arriving directly from the middle block.
3361 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3362                                        const InductionDescriptor &II,
3363                                        Value *CountRoundDown, Value *EndValue,
3364                                        BasicBlock *MiddleBlock) {
3365   // There are two kinds of external IV usages - those that use the value
3366   // computed in the last iteration (the PHI) and those that use the penultimate
3367   // value (the value that feeds into the phi from the loop latch).
3368   // We allow both, but they, obviously, have different values.
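  // E.g. for an IV starting at 0 with step 1 and a vector trip count of CRD,
  // an external user of the post-increment value sees CRD, while an external
  // user of the phi itself sees CRD - 1.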
3369 
3370   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3371 
3372   DenseMap<Value *, Value *> MissingVals;
3373 
3374   // An external user of the last iteration's value should see the value that
3375   // the remainder loop uses to initialize its own IV.
3376   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3377   for (User *U : PostInc->users()) {
3378     Instruction *UI = cast<Instruction>(U);
3379     if (!OrigLoop->contains(UI)) {
3380       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3381       MissingVals[UI] = EndValue;
3382     }
3383   }
3384 
  // An external user of the penultimate value needs to see EndValue - Step.
3386   // The simplest way to get this is to recompute it from the constituent SCEVs,
3387   // that is Start + (Step * (CRD - 1)).
3388   for (User *U : OrigPhi->users()) {
3389     auto *UI = cast<Instruction>(U);
3390     if (!OrigLoop->contains(UI)) {
3391       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3392 
3393       IRBuilder<> B(MiddleBlock->getTerminator());
3394 
3395       // Fast-math-flags propagate from the original induction instruction.
3396       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3397         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3398 
3399       Value *CountMinusOne = B.CreateSub(
3400           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3401       Value *CMO =
3402           !II.getStep()->getType()->isIntegerTy()
3403               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3404                              II.getStep()->getType())
3405               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3406       CMO->setName("cast.cmo");
3407 
3408       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3409                                     LoopVectorBody->getTerminator());
3410       Value *Escape =
3411           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3412       Escape->setName("ind.escape");
3413       MissingVals[UI] = Escape;
3414     }
3415   }
3416 
3417   for (auto &I : MissingVals) {
3418     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
3421     // In this case, if IV1 has an external use, we need to avoid adding both
3422     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3423     // don't already have an incoming value for the middle block.
3424     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3425       PHI->addIncoming(I.second, MiddleBlock);
3426   }
3427 }
3428 
3429 namespace {
3430 
3431 struct CSEDenseMapInfo {
3432   static bool canHandle(const Instruction *I) {
3433     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3434            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3435   }
3436 
3437   static inline Instruction *getEmptyKey() {
3438     return DenseMapInfo<Instruction *>::getEmptyKey();
3439   }
3440 
3441   static inline Instruction *getTombstoneKey() {
3442     return DenseMapInfo<Instruction *>::getTombstoneKey();
3443   }
3444 
3445   static unsigned getHashValue(const Instruction *I) {
3446     assert(canHandle(I) && "Unknown instruction!");
3447     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3448                                                            I->value_op_end()));
3449   }
3450 
3451   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3452     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3453         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3454       return LHS == RHS;
3455     return LHS->isIdenticalTo(RHS);
3456   }
3457 };
3458 
3459 } // end anonymous namespace
3460 
/// Perform CSE of induction variable instructions.
3462 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3464   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3465   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3466     if (!CSEDenseMapInfo::canHandle(&In))
3467       continue;
3468 
3469     // Check if we can replace this instruction with any of the
3470     // visited instructions.
3471     if (Instruction *V = CSEMap.lookup(&In)) {
3472       In.replaceAllUsesWith(V);
3473       In.eraseFromParent();
3474       continue;
3475     }
3476 
3477     CSEMap[&In] = &In;
3478   }
3479 }
3480 
3481 InstructionCost
3482 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3483                                               bool &NeedToScalarize) const {
3484   Function *F = CI->getCalledFunction();
3485   Type *ScalarRetTy = CI->getType();
3486   SmallVector<Type *, 4> Tys, ScalarTys;
3487   for (auto &ArgOp : CI->args())
3488     ScalarTys.push_back(ArgOp->getType());
3489 
  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from them,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
3494   InstructionCost ScalarCallCost =
3495       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3496   if (VF.isScalar())
3497     return ScalarCallCost;
3498 
3499   // Compute corresponding vector type for return value and arguments.
3500   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3501   for (Type *ScalarTy : ScalarTys)
3502     Tys.push_back(ToVectorTy(ScalarTy, VF));
3503 
3504   // Compute costs of unpacking argument values for the scalar calls and
3505   // packing the return values to a vector.
3506   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3507 
3508   InstructionCost Cost =
3509       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3510 
3511   // If we can't emit a vector call for this function, then the currently found
3512   // cost is the cost we need to return.
3513   NeedToScalarize = true;
3514   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3515   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3516 
3517   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3518     return Cost;
3519 
3520   // If the corresponding vector cost is cheaper, return its cost.
3521   InstructionCost VectorCallCost =
3522       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3523   if (VectorCallCost < Cost) {
3524     NeedToScalarize = false;
3525     Cost = VectorCallCost;
3526   }
3527   return Cost;
3528 }
3529 
3530 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3531   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3532     return Elt;
3533   return VectorType::get(Elt, VF);
3534 }
3535 
3536 InstructionCost
3537 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3538                                                    ElementCount VF) const {
3539   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3540   assert(ID && "Expected intrinsic call!");
3541   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3542   FastMathFlags FMF;
3543   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3544     FMF = FPMO->getFastMathFlags();
3545 
3546   SmallVector<const Value *> Arguments(CI->args());
3547   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3548   SmallVector<Type *> ParamTys;
3549   std::transform(FTy->param_begin(), FTy->param_end(),
3550                  std::back_inserter(ParamTys),
3551                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3552 
3553   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3554                                     dyn_cast<IntrinsicInst>(CI));
3555   return TTI.getIntrinsicInstrCost(CostAttrs,
3556                                    TargetTransformInfo::TCK_RecipThroughput);
3557 }
3558 
3559 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3560   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3561   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3562   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3563 }
3564 
3565 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3566   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3567   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3568   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3569 }
3570 
3571 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
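  // E.g. an i32 add whose result is known to need only 8 bits is rewritten
  // as truncs of its operands to <VF x i8>, an i8 add, and a re-extension of
  // the result back to <VF x i32>.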
3575   SmallPtrSet<Value *, 4> Erased;
3576   for (const auto &KV : Cost->getMinimalBitwidths()) {
3577     // If the value wasn't vectorized, we must maintain the original scalar
3578     // type. The absence of the value from State indicates that it
3579     // wasn't vectorized.
3580     // FIXME: Should not rely on getVPValue at this point.
3581     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3582     if (!State.hasAnyVectorValue(Def))
3583       continue;
3584     for (unsigned Part = 0; Part < UF; ++Part) {
3585       Value *I = State.get(Def, Part);
3586       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3587         continue;
3588       Type *OriginalTy = I->getType();
3589       Type *ScalarTruncatedTy =
3590           IntegerType::get(OriginalTy->getContext(), KV.second);
3591       auto *TruncatedTy = VectorType::get(
3592           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3593       if (TruncatedTy == OriginalTy)
3594         continue;
3595 
3596       IRBuilder<> B(cast<Instruction>(I));
3597       auto ShrinkOperand = [&](Value *V) -> Value * {
3598         if (auto *ZI = dyn_cast<ZExtInst>(V))
3599           if (ZI->getSrcTy() == TruncatedTy)
3600             return ZI->getOperand(0);
3601         return B.CreateZExtOrTrunc(V, TruncatedTy);
3602       };
3603 
3604       // The actual instruction modification depends on the instruction type,
3605       // unfortunately.
3606       Value *NewI = nullptr;
3607       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3608         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3609                              ShrinkOperand(BO->getOperand(1)));
3610 
3611         // Any wrapping introduced by shrinking this operation shouldn't be
3612         // considered undefined behavior. So, we can't unconditionally copy
3613         // arithmetic wrapping flags to NewI.
3614         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3615       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3616         NewI =
3617             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3618                          ShrinkOperand(CI->getOperand(1)));
3619       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3620         NewI = B.CreateSelect(SI->getCondition(),
3621                               ShrinkOperand(SI->getTrueValue()),
3622                               ShrinkOperand(SI->getFalseValue()));
3623       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3624         switch (CI->getOpcode()) {
3625         default:
3626           llvm_unreachable("Unhandled cast!");
3627         case Instruction::Trunc:
3628           NewI = ShrinkOperand(CI->getOperand(0));
3629           break;
3630         case Instruction::SExt:
3631           NewI = B.CreateSExtOrTrunc(
3632               CI->getOperand(0),
3633               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3634           break;
3635         case Instruction::ZExt:
3636           NewI = B.CreateZExtOrTrunc(
3637               CI->getOperand(0),
3638               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3639           break;
3640         }
3641       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3642         auto Elements0 =
3643             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3644         auto *O0 = B.CreateZExtOrTrunc(
3645             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3646         auto Elements1 =
3647             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3648         auto *O1 = B.CreateZExtOrTrunc(
3649             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3650 
3651         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3652       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3653         // Don't do anything with the operands, just extend the result.
3654         continue;
3655       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3656         auto Elements =
3657             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3658         auto *O0 = B.CreateZExtOrTrunc(
3659             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3660         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3661         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3662       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3663         auto Elements =
3664             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3665         auto *O0 = B.CreateZExtOrTrunc(
3666             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3667         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3668       } else {
3669         // If we don't know what to do, be conservative and don't do anything.
3670         continue;
3671       }
3672 
3673       // Lastly, extend the result.
3674       NewI->takeName(cast<Instruction>(I));
3675       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3676       I->replaceAllUsesWith(Res);
3677       cast<Instruction>(I)->eraseFromParent();
3678       Erased.insert(I);
3679       State.reset(Def, Res, Part);
3680     }
3681   }
3682 
  // We'll have created a bunch of zexts that are now unused. Clean them up.
3684   for (const auto &KV : Cost->getMinimalBitwidths()) {
3685     // If the value wasn't vectorized, we must maintain the original scalar
3686     // type. The absence of the value from State indicates that it
3687     // wasn't vectorized.
3688     // FIXME: Should not rely on getVPValue at this point.
3689     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3690     if (!State.hasAnyVectorValue(Def))
3691       continue;
3692     for (unsigned Part = 0; Part < UF; ++Part) {
3693       Value *I = State.get(Def, Part);
3694       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3695       if (Inst && Inst->use_empty()) {
3696         Value *NewI = Inst->getOperand(0);
3697         Inst->eraseFromParent();
3698         State.reset(Def, NewI, Part);
3699       }
3700     }
3701   }
3702 }
3703 
3704 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3705   // Insert truncates and extends for any truncated instructions as hints to
3706   // InstCombine.
3707   if (VF.isVector())
3708     truncateToMinimalBitwidths(State);
3709 
3710   // Fix widened non-induction PHIs by setting up the PHI operands.
3711   if (OrigPHIsToFix.size()) {
3712     assert(EnableVPlanNativePath &&
3713            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3714     fixNonInductionPHIs(State);
3715   }
3716 
3717   // At this point every instruction in the original loop is widened to a
3718   // vector form. Now we need to fix the recurrences in the loop. These PHI
3719   // nodes are currently empty because we did not want to introduce cycles.
3720   // This is the second stage of vectorizing recurrences.
3721   fixCrossIterationPHIs(State);
3722 
3723   // Forget the original basic block.
3724   PSE.getSE()->forgetLoop(OrigLoop);
3725 
3726   // If we inserted an edge from the middle block to the unique exit block,
3727   // update uses outside the loop (phis) to account for the newly inserted
3728   // edge.
3729   if (!Cost->requiresScalarEpilogue(VF)) {
3730     // Fix-up external users of the induction variables.
3731     for (auto &Entry : Legal->getInductionVars())
3732       fixupIVUsers(Entry.first, Entry.second,
3733                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3734                    IVEndValues[Entry.first], LoopMiddleBlock);
3735 
3736     fixLCSSAPHIs(State);
3737   }
3738 
3739   for (Instruction *PI : PredicatedInstructions)
3740     sinkScalarOperands(&*PI);
3741 
3742   // Remove redundant induction instructions.
3743   cse(LoopVectorBody);
3744 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
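  //
  // Roughly speaking (for illustration only): if the original loop averaged
  // 64 iterations per invocation and VF.getKnownMinValue() * UF is 8, the
  // vector loop is assigned a profile of about 64 / 8 = 8 iterations, with
  // the few leftover iterations attributed to the remainder loop.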
3758   setProfileInfoAfterUnrolling(
3759       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3760       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3761 }
3762 
3763 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3764   // In order to support recurrences we need to be able to vectorize Phi nodes.
3765   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3766   // stage #2: We now need to fix the recurrences by adding incoming edges to
3767   // the currently empty PHI nodes. At this point every instruction in the
3768   // original loop is widened to a vector form so we can use them to construct
3769   // the incoming edges.
3770   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3771   for (VPRecipeBase &R : Header->phis()) {
3772     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3773       fixReduction(ReductionPhi, State);
3774     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3775       fixFirstOrderRecurrence(FOR, State);
3776   }
3777 }
3778 
3779 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3780     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3781   // This is the second phase of vectorizing first-order recurrences. An
3782   // overview of the transformation is described below. Suppose we have the
3783   // following loop.
3784   //
3785   //   for (int i = 0; i < n; ++i)
3786   //     b[i] = a[i] - a[i - 1];
3787   //
3788   // There is a first-order recurrence on "a". For this loop, the shorthand
3789   // scalar IR looks like:
3790   //
3791   //   scalar.ph:
3792   //     s_init = a[-1]
3793   //     br scalar.body
3794   //
3795   //   scalar.body:
3796   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3797   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3798   //     s2 = a[i]
3799   //     b[i] = s2 - s1
3800   //     br cond, scalar.body, ...
3801   //
  // In this example, s1 is a recurrence because its value depends on the
3803   // previous iteration. In the first phase of vectorization, we created a
3804   // vector phi v1 for s1. We now complete the vectorization and produce the
3805   // shorthand vector IR shown below (for VF = 4, UF = 1).
3806   //
3807   //   vector.ph:
3808   //     v_init = vector(..., ..., ..., a[-1])
3809   //     br vector.body
3810   //
3811   //   vector.body
3812   //     i = phi [0, vector.ph], [i+4, vector.body]
3813   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3814   //     v2 = a[i, i+1, i+2, i+3];
3815   //     v3 = vector(v1(3), v2(0, 1, 2))
3816   //     b[i, i+1, i+2, i+3] = v2 - v3
3817   //     br cond, vector.body, middle.block
3818   //
3819   //   middle.block:
3820   //     x = v2(3)
3821   //     br scalar.ph
3822   //
3823   //   scalar.ph:
3824   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3825   //     br scalar.body
3826   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3829 
3830   // Extract the last vector element in the middle block. This will be the
3831   // initial value for the recurrence when jumping to the scalar loop.
3832   VPValue *PreviousDef = PhiR->getBackedgeValue();
3833   Value *Incoming = State.get(PreviousDef, UF - 1);
3834   auto *ExtractForScalar = Incoming;
3835   auto *IdxTy = Builder.getInt32Ty();
3836   if (VF.isVector()) {
3837     auto *One = ConstantInt::get(IdxTy, 1);
3838     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3839     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3840     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3841     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3842                                                     "vector.recur.extract");
3843   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
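  // For illustration only: with VF = 4 and UF = 1, if the last vector value of
  // the recurrence is <s1, s2, s3, s4>, ExtractForScalar is s4 and
  // ExtractForPhiUsedOutsideLoop is s3.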
3849   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3850   if (VF.isVector()) {
3851     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3852     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3853     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3854         Incoming, Idx, "vector.recur.extract.for.phi");
3855   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to the
    // last part of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
3860     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3861 
3862   // Fix the initial value of the original recurrence in the scalar loop.
3863   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3864   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3865   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3866   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3867   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3868     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3869     Start->addIncoming(Incoming, BB);
3870   }
3871 
3872   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3873   Phi->setName("scalar.recur");
3874 
3875   // Finally, fix users of the recurrence outside the loop. The users will need
3876   // either the last value of the scalar recurrence or the last value of the
3877   // vector recurrence we extracted in the middle block. Since the loop is in
3878   // LCSSA form, we just need to find all the phi nodes for the original scalar
3879   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
3884   if (!Cost->requiresScalarEpilogue(VF))
3885     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3886       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
3887         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3888 }
3889 
3890 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3891                                        VPTransformState &State) {
3892   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3894   assert(Legal->isReductionVariable(OrigPhi) &&
3895          "Unable to find the reduction variable");
3896   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3897 
3898   RecurKind RK = RdxDesc.getRecurrenceKind();
3899   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3900   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3901   setDebugLocFromInst(ReductionStartValue);
3902 
3903   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3904   // This is the vector-clone of the value that leaves the loop.
3905   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3906 
  // Wrap flags are in general invalid after vectorization, so clear them.
3908   clearReductionWrapFlags(RdxDesc, State);
3909 
3910   // Before each round, move the insertion point right between
3911   // the PHIs and the values we are going to write.
3912   // This allows us to write both PHINodes and the extractelement
3913   // instructions.
3914   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3915 
3916   setDebugLocFromInst(LoopExitInst);
3917 
3918   Type *PhiTy = OrigPhi->getType();
3919   // If tail is folded by masking, the vector value to leave the loop should be
3920   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3921   // instead of the former. For an inloop reduction the reduction will already
3922   // be predicated, and does not need to be handled here.
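  //
  // For illustration only (shorthand IR, VF = 4, names are made up): the loop
  // body may contain
  //
  //   %rdx.next = add <4 x i32> %rdx.phi, %val
  //   %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  //
  // and it is %sel, rather than %rdx.next, that should leave the loop.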
3923   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3924     for (unsigned Part = 0; Part < UF; ++Part) {
3925       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3926       Value *Sel = nullptr;
3927       for (User *U : VecLoopExitInst->users()) {
3928         if (isa<SelectInst>(U)) {
3929           assert(!Sel && "Reduction exit feeding two selects");
3930           Sel = U;
3931         } else
3932           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3933       }
3934       assert(Sel && "Reduction exit feeds no select");
3935       State.reset(LoopExitInstDef, Sel, Part);
3936 
3937       // If the target can create a predicated operator for the reduction at no
3938       // extra cost in the loop (for example a predicated vadd), it can be
3939       // cheaper for the select to remain in the loop than be sunk out of it,
3940       // and so use the select value for the phi instead of the old
3941       // LoopExitValue.
3942       if (PreferPredicatedReductionSelect ||
3943           TTI->preferPredicatedReductionSelect(
3944               RdxDesc.getOpcode(), PhiTy,
3945               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3948         VecRdxPhi->setIncomingValueForBlock(
3949             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
3950       }
3951     }
3952   }
3953 
3954   // If the vector reduction can be performed in a smaller type, we truncate
3955   // then extend the loop exit value to enable InstCombine to evaluate the
3956   // entire expression in the smaller type.
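  //
  // For illustration only (shorthand IR, VF = 4, i32 phi with an i8
  // recurrence type): inside the loop each unrolled part %rdx gets the hint
  //
  //   %rdx.tr  = trunc <4 x i32> %rdx to <4 x i8>
  //   %rdx.ext = zext <4 x i8> %rdx.tr to <4 x i32>   ; sext if signed
  //
  // and users of %rdx are redirected to %rdx.ext; the middle block then
  // truncates back to <4 x i8> before forming the final reduction.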
3957   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3958     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3959     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3960     Builder.SetInsertPoint(
3961         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3962     VectorParts RdxParts(UF);
3963     for (unsigned Part = 0; Part < UF; ++Part) {
3964       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3965       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3966       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3967                                         : Builder.CreateZExt(Trunc, VecTy);
3968       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3969         if (U != Trunc) {
3970           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3971           RdxParts[Part] = Extnd;
3972         }
3973     }
3974     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3975     for (unsigned Part = 0; Part < UF; ++Part) {
3976       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3977       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3978     }
3979   }
3980 
3981   // Reduce all of the unrolled parts into a single vector.
3982   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3983   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3984 
3985   // The middle block terminator has already been assigned a DebugLoc here (the
3986   // OrigLoop's single latch terminator). We want the whole middle block to
3987   // appear to execute on this line because: (a) it is all compiler generated,
3988   // (b) these instructions are always executed after evaluating the latch
3989   // conditional branch, and (c) other passes may add new predecessors which
3990   // terminate on this line. This is the easiest way to ensure we don't
3991   // accidentally cause an extra step back into the loop while debugging.
3992   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3993   if (PhiR->isOrdered())
3994     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3995   else {
3996     // Floating-point operations should have some FMF to enable the reduction.
3997     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3998     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3999     for (unsigned Part = 1; Part < UF; ++Part) {
4000       Value *RdxPart = State.get(LoopExitInstDef, Part);
4001       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4002         ReducedPartRdx = Builder.CreateBinOp(
4003             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4004       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4005         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4006                                            ReducedPartRdx, RdxPart);
4007       else
4008         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4009     }
4010   }
4011 
4012   // Create the reduction after the loop. Note that inloop reductions create the
4013   // target reduction in the loop using a Reduction recipe.
4014   if (VF.isVector() && !PhiR->isInLoop()) {
4015     ReducedPartRdx =
4016         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4017     // If the reduction can be performed in a smaller type, we need to extend
4018     // the reduction to the wider type before we branch to the original loop.
4019     if (PhiTy != RdxDesc.getRecurrenceType())
4020       ReducedPartRdx = RdxDesc.isSigned()
4021                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4022                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4023   }
4024 
4025   PHINode *ResumePhi =
4026       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4027 
4028   // Create a phi node that merges control-flow from the backedge-taken check
4029   // block and the middle block.
4030   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4031                                         LoopScalarPreHeader->getTerminator());
4032 
4033   // If we are fixing reductions in the epilogue loop then we should already
4034   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4035   // we carry over the incoming values correctly.
4036   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4037     if (Incoming == LoopMiddleBlock)
4038       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4039     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4040       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4041                               Incoming);
4042     else
4043       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4044   }
4045 
4046   // Set the resume value for this reduction
4047   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4048 
4049   // Now, we need to fix the users of the reduction variable
4050   // inside and outside of the scalar remainder loop.
4051 
4052   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4053   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4055   if (!Cost->requiresScalarEpilogue(VF))
4056     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4057       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4058         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4059 
4060   // Fix the scalar loop reduction variable with the incoming reduction sum
4061   // from the vector body and from the backedge value.
4062   int IncomingEdgeBlockIdx =
4063       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4064   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4065   // Pick the other block.
4066   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4067   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4068   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4069 }
4070 
4071 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4072                                                   VPTransformState &State) {
4073   RecurKind RK = RdxDesc.getRecurrenceKind();
4074   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4075     return;
4076 
4077   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4078   assert(LoopExitInstr && "null loop exit instruction");
4079   SmallVector<Instruction *, 8> Worklist;
4080   SmallPtrSet<Instruction *, 8> Visited;
4081   Worklist.push_back(LoopExitInstr);
4082   Visited.insert(LoopExitInstr);
4083 
4084   while (!Worklist.empty()) {
4085     Instruction *Cur = Worklist.pop_back_val();
4086     if (isa<OverflowingBinaryOperator>(Cur))
4087       for (unsigned Part = 0; Part < UF; ++Part) {
4088         // FIXME: Should not rely on getVPValue at this point.
4089         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4090         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4091       }
4092 
4093     for (User *U : Cur->users()) {
4094       Instruction *UI = cast<Instruction>(U);
4095       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4096           Visited.insert(UI).second)
4097         Worklist.push_back(UI);
4098     }
4099   }
4100 }
4101 
4102 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4103   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4104     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand-updated by the reduction and recurrence
      // code above; leave them alone.
4107       continue;
4108 
4109     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4110     // Non-instruction incoming values will have only one value.
4111 
4112     VPLane Lane = VPLane::getFirstLane();
4113     if (isa<Instruction>(IncomingValue) &&
4114         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4115                                            VF))
4116       Lane = VPLane::getLastLaneForVF(VF);
4117 
4118     // Can be a loop invariant incoming value or the last scalar value to be
4119     // extracted from the vectorized loop.
4120     // FIXME: Should not rely on getVPValue at this point.
4121     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4122     Value *lastIncomingValue =
4123         OrigLoop->isLoopInvariant(IncomingValue)
4124             ? IncomingValue
4125             : State.get(State.Plan->getVPValue(IncomingValue, true),
4126                         VPIteration(UF - 1, Lane));
4127     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4128   }
4129 }
4130 
4131 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4132   // The basic block and loop containing the predicated instruction.
4133   auto *PredBB = PredInst->getParent();
4134   auto *VectorLoop = LI->getLoopFor(PredBB);
4135 
4136   // Initialize a worklist with the operands of the predicated instruction.
4137   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4138 
4139   // Holds instructions that we need to analyze again. An instruction may be
4140   // reanalyzed if we don't yet know if we can sink it or not.
4141   SmallVector<Instruction *, 8> InstsToReanalyze;
4142 
4143   // Returns true if a given use occurs in the predicated block. Phi nodes use
4144   // their operands in their corresponding predecessor blocks.
4145   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4146     auto *I = cast<Instruction>(U.getUser());
4147     BasicBlock *BB = I->getParent();
4148     if (auto *Phi = dyn_cast<PHINode>(I))
4149       BB = Phi->getIncomingBlock(
4150           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4151     return BB == PredBB;
4152   };
4153 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when one full
  // pass over the worklist fails to sink a single instruction.
4158   bool Changed;
4159   do {
4160     // Add the instructions that need to be reanalyzed to the worklist, and
4161     // reset the changed indicator.
4162     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4163     InstsToReanalyze.clear();
4164     Changed = false;
4165 
4166     while (!Worklist.empty()) {
4167       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4168 
4169       // We can't sink an instruction if it is a phi node, is not in the loop,
4170       // or may have side effects.
4171       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4172           I->mayHaveSideEffects())
4173         continue;
4174 
4175       // If the instruction is already in PredBB, check if we can sink its
4176       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4177       // sinking the scalar instruction I, hence it appears in PredBB; but it
4178       // may have failed to sink I's operands (recursively), which we try
4179       // (again) here.
4180       if (I->getParent() == PredBB) {
4181         Worklist.insert(I->op_begin(), I->op_end());
4182         continue;
4183       }
4184 
4185       // It's legal to sink the instruction if all its uses occur in the
4186       // predicated block. Otherwise, there's nothing to do yet, and we may
4187       // need to reanalyze the instruction.
4188       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4189         InstsToReanalyze.push_back(I);
4190         continue;
4191       }
4192 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4195       I->moveBefore(&*PredBB->getFirstInsertionPt());
4196       Worklist.insert(I->op_begin(), I->op_end());
4197 
4198       // The sinking may have enabled other instructions to be sunk, so we will
4199       // need to iterate.
4200       Changed = true;
4201     }
4202   } while (Changed);
4203 }
4204 
4205 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4206   for (PHINode *OrigPhi : OrigPHIsToFix) {
4207     VPWidenPHIRecipe *VPPhi =
4208         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4209     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4210     // Make sure the builder has a valid insert point.
4211     Builder.SetInsertPoint(NewPhi);
4212     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4213       VPValue *Inc = VPPhi->getIncomingValue(i);
4214       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4215       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4216     }
4217   }
4218 }
4219 
4220 bool InnerLoopVectorizer::useOrderedReductions(
4221     const RecurrenceDescriptor &RdxDesc) {
4222   return Cost->useOrderedReductions(RdxDesc);
4223 }
4224 
4225 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4226                                               VPWidenPHIRecipe *PhiR,
4227                                               VPTransformState &State) {
4228   PHINode *P = cast<PHINode>(PN);
4229   if (EnableVPlanNativePath) {
4230     // Currently we enter here in the VPlan-native path for non-induction
4231     // PHIs where all control flow is uniform. We simply widen these PHIs.
4232     // Create a vector phi with no operands - the vector phi operands will be
4233     // set at the end of vector code generation.
4234     Type *VecTy = (State.VF.isScalar())
4235                       ? PN->getType()
4236                       : VectorType::get(PN->getType(), State.VF);
4237     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4238     State.set(PhiR, VecPhi, 0);
4239     OrigPHIsToFix.push_back(P);
4240 
4241     return;
4242   }
4243 
4244   assert(PN->getParent() == OrigLoop->getHeader() &&
4245          "Non-header phis should have been handled elsewhere");
4246 
4247   // In order to support recurrences we need to be able to vectorize Phi nodes.
4248   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4249   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4250   // this value when we vectorize all of the instructions that use the PHI.
4251 
4252   assert(!Legal->isReductionVariable(P) &&
4253          "reductions should be handled elsewhere");
4254 
4255   setDebugLocFromInst(P);
4256 
4257   // This PHINode must be an induction variable.
4258   // Make sure that we know about it.
4259   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4260 
4261   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4262   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4263 
4264   auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
4265   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
4266 
4267   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4268   // which can be found from the original scalar operations.
4269   switch (II.getKind()) {
4270   case InductionDescriptor::IK_NoInduction:
4271     llvm_unreachable("Unknown induction");
4272   case InductionDescriptor::IK_IntInduction:
4273   case InductionDescriptor::IK_FpInduction:
4274     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4275   case InductionDescriptor::IK_PtrInduction: {
4276     // Handle the pointer induction variable case.
4277     assert(P->getType()->isPointerTy() && "Unexpected type.");
4278 
4279     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4280       // This is the normalized GEP that starts counting at zero.
4281       Value *PtrInd =
4282           Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
4283       // Determine the number of scalars we need to generate for each unroll
4284       // iteration. If the instruction is uniform, we only need to generate the
4285       // first lane. Otherwise, we generate all VF values.
4286       bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
4287       assert((IsUniform || !State.VF.isScalable()) &&
4288              "Cannot scalarize a scalable VF");
4289       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4290 
4291       for (unsigned Part = 0; Part < UF; ++Part) {
4292         Value *PartStart =
4293             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4294 
4295         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4296           Value *Idx = Builder.CreateAdd(
4297               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4298           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4299 
4300           Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
4301                                         State.CFG.PrevBB->getTerminator());
4302           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx,
4303                                                 II.getStartValue(), Step, II);
4304           SclrGep->setName("next.gep");
4305           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4306         }
4307       }
4308       return;
4309     }
4310     assert(isa<SCEVConstant>(II.getStep()) &&
4311            "Induction step not a SCEV constant!");
4312     Type *PhiType = II.getStep()->getType();
4313 
4314     // Build a pointer phi
4315     Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
4316     Type *ScStValueType = ScalarStartValue->getType();
4317     PHINode *NewPointerPhi =
4318         PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
4319     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4320 
4321     // A pointer induction, performed by using a gep
4322     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4323     Instruction *InductionLoc = LoopLatch->getTerminator();
4324     const SCEV *ScalarStep = II.getStep();
4325     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4326     Value *ScalarStepValue =
4327         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4328     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4329     Value *NumUnrolledElems =
4330         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4331     Value *InductionGEP = GetElementPtrInst::Create(
4332         II.getElementType(), NewPointerPhi,
4333         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4334         InductionLoc);
4335     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4336 
4337     // Create UF many actual address geps that use the pointer
4338     // phi as base and a vectorized version of the step value
4339     // (<step*0, ..., step*N>) as offset.
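    // For illustration only: with VF = 4, the part-1 GEP uses offsets
    // <4*step, 5*step, 6*step, 7*step> from the pointer phi.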
4340     for (unsigned Part = 0; Part < State.UF; ++Part) {
4341       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4342       Value *StartOffsetScalar =
4343           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4344       Value *StartOffset =
4345           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4346       // Create a vector of consecutive numbers from zero to VF.
4347       StartOffset =
4348           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4349 
4350       Value *GEP = Builder.CreateGEP(
4351           II.getElementType(), NewPointerPhi,
4352           Builder.CreateMul(
4353               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4354               "vector.gep"));
4355       State.set(PhiR, GEP, Part);
4356     }
4357   }
4358   }
4359 }
4360 
4361 /// A helper function for checking whether an integer division-related
4362 /// instruction may divide by zero (in which case it must be predicated if
4363 /// executed conditionally in the scalar code).
4364 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4366 /// converted into multiplication, so we will still end up scalarizing
4367 /// the division, but can do so w/o predication.
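/// For example, 'udiv i32 %x, %n' with a loop-varying %n may divide by zero
/// and must be predicated if executed conditionally, whereas 'udiv i32 %x, 7'
/// never can.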
4368 static bool mayDivideByZero(Instruction &I) {
4369   assert((I.getOpcode() == Instruction::UDiv ||
4370           I.getOpcode() == Instruction::SDiv ||
4371           I.getOpcode() == Instruction::URem ||
4372           I.getOpcode() == Instruction::SRem) &&
4373          "Unexpected instruction");
4374   Value *Divisor = I.getOperand(1);
4375   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4376   return !CInt || CInt->isZero();
4377 }
4378 
4379 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4380                                                VPUser &ArgOperands,
4381                                                VPTransformState &State) {
4382   assert(!isa<DbgInfoIntrinsic>(I) &&
4383          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4384   setDebugLocFromInst(&I);
4385 
4386   Module *M = I.getParent()->getParent()->getParent();
4387   auto *CI = cast<CallInst>(&I);
4388 
4389   SmallVector<Type *, 4> Tys;
4390   for (Value *ArgOperand : CI->args())
4391     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4392 
4393   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4394 
  // The following determines whether we use an intrinsic or a plain call for
  // the vectorized version of the instruction, i.e., whether an intrinsic
  // call is more beneficial than a library call.
4398   bool NeedToScalarize = false;
4399   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4400   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4401   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4402   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4403          "Instruction should be scalarized elsewhere.");
4404   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4405          "Either the intrinsic cost or vector call cost must be valid");
4406 
4407   for (unsigned Part = 0; Part < UF; ++Part) {
4408     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4409     SmallVector<Value *, 4> Args;
4410     for (auto &I : enumerate(ArgOperands.operands())) {
4411       // Some intrinsics have a scalar argument - don't replace it with a
4412       // vector.
4413       Value *Arg;
4414       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4415         Arg = State.get(I.value(), Part);
4416       else {
4417         Arg = State.get(I.value(), VPIteration(0, 0));
4418         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4419           TysForDecl.push_back(Arg->getType());
4420       }
4421       Args.push_back(Arg);
4422     }
4423 
4424     Function *VectorF;
4425     if (UseVectorIntrinsic) {
4426       // Use vector version of the intrinsic.
4427       if (VF.isVector())
4428         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4429       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4430       assert(VectorF && "Can't retrieve vector intrinsic.");
4431     } else {
4432       // Use vector version of the function call.
4433       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4434 #ifndef NDEBUG
4435       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4436              "Can't create vector function.");
4437 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4449   }
4450 }
4451 
4452 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4453   // We should not collect Scalars more than once per VF. Right now, this
4454   // function is called from collectUniformsAndScalars(), which already does
4455   // this check. Collecting Scalars for VF=1 does not make any sense.
4456   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4457          "This function should not be visited twice for the same VF");
4458 
4459   SmallSetVector<Instruction *, 8> Worklist;
4460 
4461   // These sets are used to seed the analysis with pointers used by memory
4462   // accesses that will remain scalar.
4463   SmallSetVector<Instruction *, 8> ScalarPtrs;
4464   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4465   auto *Latch = TheLoop->getLoopLatch();
4466 
4467   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4468   // The pointer operands of loads and stores will be scalar as long as the
4469   // memory access is not a gather or scatter operation. The value operand of a
4470   // store will remain scalar if the store is scalarized.
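  // For example, the pointer operand of a gather/scatter is not a scalar use,
  // whereas the pointer operand of a consecutive (widened) access is.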
4471   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4472     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4473     assert(WideningDecision != CM_Unknown &&
4474            "Widening decision should be ready at this moment");
4475     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4476       if (Ptr == Store->getValueOperand())
4477         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4480     return WideningDecision != CM_GatherScatter;
4481   };
4482 
4483   // A helper that returns true if the given value is a bitcast or
4484   // getelementptr instruction contained in the loop.
4485   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4486     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4487             isa<GetElementPtrInst>(V)) &&
4488            !TheLoop->isLoopInvariant(V);
4489   };
4490 
4491   // A helper that evaluates a memory access's use of a pointer. If the use will
4492   // be a scalar use and the pointer is only used by memory accesses, we place
4493   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4494   // PossibleNonScalarPtrs.
4495   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4496     // We only care about bitcast and getelementptr instructions contained in
4497     // the loop.
4498     if (!isLoopVaryingBitCastOrGEP(Ptr))
4499       return;
4500 
4501     // If the pointer has already been identified as scalar (e.g., if it was
4502     // also identified as uniform), there's nothing to do.
4503     auto *I = cast<Instruction>(Ptr);
4504     if (Worklist.count(I))
4505       return;
4506 
4507     // If the use of the pointer will be a scalar use, and all users of the
4508     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4509     // place the pointer in PossibleNonScalarPtrs.
4510     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4511           return isa<LoadInst>(U) || isa<StoreInst>(U);
4512         }))
4513       ScalarPtrs.insert(I);
4514     else
4515       PossibleNonScalarPtrs.insert(I);
4516   };
4517 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4522   //
4523   // (1) Add to the worklist all instructions that have been identified as
4524   // uniform-after-vectorization.
4525   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4526 
4527   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4528   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4530   // scatter operation. The value operand of a store will remain scalar if the
4531   // store is scalarized.
4532   for (auto *BB : TheLoop->blocks())
4533     for (auto &I : *BB) {
4534       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4535         evaluatePtrUse(Load, Load->getPointerOperand());
4536       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4537         evaluatePtrUse(Store, Store->getPointerOperand());
4538         evaluatePtrUse(Store, Store->getValueOperand());
4539       }
4540     }
4541   for (auto *I : ScalarPtrs)
4542     if (!PossibleNonScalarPtrs.count(I)) {
4543       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4544       Worklist.insert(I);
4545     }
4546 
4547   // Insert the forced scalars.
4548   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4549   // induction variable when the PHI user is scalarized.
4550   auto ForcedScalar = ForcedScalars.find(VF);
4551   if (ForcedScalar != ForcedScalars.end())
4552     for (auto *I : ForcedScalar->second)
4553       Worklist.insert(I);
4554 
4555   // Expand the worklist by looking through any bitcasts and getelementptr
4556   // instructions we've already identified as scalar. This is similar to the
4557   // expansion step in collectLoopUniforms(); however, here we're only
4558   // expanding to include additional bitcasts and getelementptr instructions.
4559   unsigned Idx = 0;
4560   while (Idx != Worklist.size()) {
4561     Instruction *Dst = Worklist[Idx++];
4562     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4563       continue;
4564     auto *Src = cast<Instruction>(Dst->getOperand(0));
4565     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4566           auto *J = cast<Instruction>(U);
4567           return !TheLoop->contains(J) || Worklist.count(J) ||
4568                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4569                   isScalarUse(J, Src));
4570         })) {
4571       Worklist.insert(Src);
4572       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4573     }
4574   }
4575 
4576   // An induction variable will remain scalar if all users of the induction
4577   // variable and induction variable update remain scalar.
4578   for (auto &Induction : Legal->getInductionVars()) {
4579     auto *Ind = Induction.first;
4580     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4581 
4582     // If tail-folding is applied, the primary induction variable will be used
4583     // to feed a vector compare.
4584     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4585       continue;
4586 
4587     // Returns true if \p Indvar is a pointer induction that is used directly by
4588     // load/store instruction \p I.
4589     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4590                                               Instruction *I) {
4591       return Induction.second.getKind() ==
4592                  InductionDescriptor::IK_PtrInduction &&
4593              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4594              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4595     };
4596 
4597     // Determine if all users of the induction variable are scalar after
4598     // vectorization.
4599     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4600       auto *I = cast<Instruction>(U);
4601       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4602              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4603     });
4604     if (!ScalarInd)
4605       continue;
4606 
4607     // Determine if all users of the induction variable update instruction are
4608     // scalar after vectorization.
4609     auto ScalarIndUpdate =
4610         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4611           auto *I = cast<Instruction>(U);
4612           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4613                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4614         });
4615     if (!ScalarIndUpdate)
4616       continue;
4617 
4618     // The induction variable and its update instruction will remain scalar.
4619     Worklist.insert(Ind);
4620     Worklist.insert(IndUpdate);
4621     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4622     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4623                       << "\n");
4624   }
4625 
4626   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4627 }
4628 
4629 bool LoopVectorizationCostModel::isScalarWithPredication(
4630     Instruction *I, ElementCount VF) const {
4631   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4632     return false;
4633   switch(I->getOpcode()) {
4634   default:
4635     break;
4636   case Instruction::Load:
4637   case Instruction::Store: {
4638     if (!Legal->isMaskRequired(I))
4639       return false;
4640     auto *Ptr = getLoadStorePointerOperand(I);
4641     auto *Ty = getLoadStoreType(I);
4642     Type *VTy = Ty;
4643     if (VF.isVector())
4644       VTy = VectorType::get(Ty, VF);
4645     const Align Alignment = getLoadStoreAlignment(I);
4646     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4647                                 TTI.isLegalMaskedGather(VTy, Alignment))
4648                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4649                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4650   }
4651   case Instruction::UDiv:
4652   case Instruction::SDiv:
4653   case Instruction::SRem:
4654   case Instruction::URem:
4655     return mayDivideByZero(*I);
4656   }
4657   return false;
4658 }
4659 
4660 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4661     Instruction *I, ElementCount VF) {
4662   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4663   assert(getWideningDecision(I, VF) == CM_Unknown &&
4664          "Decision should not be set yet.");
4665   auto *Group = getInterleavedAccessGroup(I);
4666   assert(Group && "Must have a group.");
4667 
  // If the instruction's allocated size doesn't equal its type size, it
4669   // requires padding and will be scalarized.
4670   auto &DL = I->getModule()->getDataLayout();
4671   auto *ScalarTy = getLoadStoreType(I);
4672   if (hasIrregularType(ScalarTy, DL))
4673     return false;
4674 
4675   // Check if masking is required.
4676   // A Group may need masking for one of two reasons: it resides in a block that
4677   // needs predication, or it was decided to use masking to deal with gaps
4678   // (either a gap at the end of a load-access that may result in a speculative
4679   // load, or any gaps in a store-access).
4680   bool PredicatedAccessRequiresMasking =
4681       blockNeedsPredicationForAnyReason(I->getParent()) &&
4682       Legal->isMaskRequired(I);
4683   bool LoadAccessWithGapsRequiresEpilogMasking =
4684       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4685       !isScalarEpilogueAllowed();
4686   bool StoreAccessWithGapsRequiresMasking =
4687       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4688   if (!PredicatedAccessRequiresMasking &&
4689       !LoadAccessWithGapsRequiresEpilogMasking &&
4690       !StoreAccessWithGapsRequiresMasking)
4691     return true;
4692 
4693   // If masked interleaving is required, we expect that the user/target had
4694   // enabled it, because otherwise it either wouldn't have been created or
4695   // it should have been invalidated by the CostModel.
4696   assert(useMaskedInterleavedAccesses(TTI) &&
4697          "Masked interleave-groups for predicated accesses are not enabled.");
4698 
4699   if (Group->isReverse())
4700     return false;
4701 
4702   auto *Ty = getLoadStoreType(I);
4703   const Align Alignment = getLoadStoreAlignment(I);
4704   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4705                           : TTI.isLegalMaskedStore(Ty, Alignment);
4706 }
4707 
4708 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4709     Instruction *I, ElementCount VF) {
4710   // Get and ensure we have a valid memory instruction.
4711   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4712 
4713   auto *Ptr = getLoadStorePointerOperand(I);
4714   auto *ScalarTy = getLoadStoreType(I);
4715 
4716   // In order to be widened, the pointer should be consecutive, first of all.
4717   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4718     return false;
4719 
4720   // If the instruction is a store located in a predicated block, it will be
4721   // scalarized.
4722   if (isScalarWithPredication(I, VF))
4723     return false;
4724 
  // If the instruction's allocated size doesn't equal its type size, it
4726   // requires padding and will be scalarized.
4727   auto &DL = I->getModule()->getDataLayout();
4728   if (hasIrregularType(ScalarTy, DL))
4729     return false;
4730 
4731   return true;
4732 }
4733 
4734 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4735   // We should not collect Uniforms more than once per VF. Right now,
4736   // this function is called from collectUniformsAndScalars(), which
4737   // already does this check. Collecting Uniforms for VF=1 does not make any
4738   // sense.
4739 
4740   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4741          "This function should not be visited twice for the same VF");
4742 
  // Initialize Uniforms[VF] so that even if we find no uniform values, we do
  // not analyze this VF again; Uniforms.count(VF) will return 1.
4745   Uniforms[VF].clear();
4746 
4747   // We now know that the loop is vectorizable!
4748   // Collect instructions inside the loop that will remain uniform after
4749   // vectorization.
4750 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4753   auto isOutOfScope = [&](Value *V) -> bool {
4754     Instruction *I = dyn_cast<Instruction>(V);
4755     return (!I || !TheLoop->contains(I));
4756   };
4757 
4758   // Worklist containing uniform instructions demanding lane 0.
4759   SetVector<Instruction *> Worklist;
4760   BasicBlock *Latch = TheLoop->getLoopLatch();
4761 
4762   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4763   // that are scalar with predication must not be considered uniform after
4764   // vectorization, because that would create an erroneous replicating region
4765   // where only a single instance out of VF should be formed.
4766   // TODO: optimize such seldom cases if found important, see PR40816.
4767   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4768     if (isOutOfScope(I)) {
4769       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4770                         << *I << "\n");
4771       return;
4772     }
4773     if (isScalarWithPredication(I, VF)) {
4774       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4775                         << *I << "\n");
4776       return;
4777     }
4778     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4779     Worklist.insert(I);
4780   };
4781 
4782   // Start with the conditional branch. If the branch condition is an
4783   // instruction contained in the loop that is only used by the branch, it is
4784   // uniform.
4785   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4786   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4787     addToWorklistIfAllowed(Cmp);
4788 
4789   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4790     InstWidening WideningDecision = getWideningDecision(I, VF);
4791     assert(WideningDecision != CM_Unknown &&
4792            "Widening decision should be ready at this moment");
4793 
4794     // A uniform memory op is itself uniform.  We exclude uniform stores
4795     // here as they demand the last lane, not the first one.
4796     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4797       assert(WideningDecision == CM_Scalarize);
4798       return true;
4799     }
4800 
4801     return (WideningDecision == CM_Widen ||
4802             WideningDecision == CM_Widen_Reverse ||
4803             WideningDecision == CM_Interleave);
4804   };
4805 
4806 
4807   // Returns true if Ptr is the pointer operand of a memory access instruction
4808   // I, and I is known to not require scalarization.
4809   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4810     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4811   };
4812 
4813   // Holds a list of values which are known to have at least one uniform use.
4814   // Note that there may be other uses which aren't uniform.  A "uniform use"
4815   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
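  // For example, the pointer operand of a consecutive (widened) load only
  // demands lane 0: the wide load is formed from the first lane's address,
  // even though the per-lane addresses themselves differ.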
4818   SetVector<Value *> HasUniformUse;
4819 
4820   // Scan the loop for instructions which are either a) known to have only
4821   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4822   for (auto *BB : TheLoop->blocks())
4823     for (auto &I : *BB) {
4824       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4825         switch (II->getIntrinsicID()) {
4826         case Intrinsic::sideeffect:
4827         case Intrinsic::experimental_noalias_scope_decl:
4828         case Intrinsic::assume:
4829         case Intrinsic::lifetime_start:
4830         case Intrinsic::lifetime_end:
4831           if (TheLoop->hasLoopInvariantOperands(&I))
4832             addToWorklistIfAllowed(&I);
4833           break;
4834         default:
4835           break;
4836         }
4837       }
4838 
4839       // ExtractValue instructions must be uniform, because the operands are
4840       // known to be loop-invariant.
4841       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4842         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4843                "Expected aggregate value to be loop invariant");
4844         addToWorklistIfAllowed(EVI);
4845         continue;
4846       }
4847 
4848       // If there's no pointer operand, there's nothing to do.
4849       auto *Ptr = getLoadStorePointerOperand(&I);
4850       if (!Ptr)
4851         continue;
4852 
4853       // A uniform memory op is itself uniform.  We exclude uniform stores
4854       // here as they demand the last lane, not the first one.
4855       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4856         addToWorklistIfAllowed(&I);
4857 
4858       if (isUniformDecision(&I, VF)) {
4859         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4860         HasUniformUse.insert(Ptr);
4861       }
4862     }
4863 
4864   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4865   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4866   // disallows uses outside the loop as well.
4867   for (auto *V : HasUniformUse) {
4868     if (isOutOfScope(V))
4869       continue;
4870     auto *I = cast<Instruction>(V);
4871     auto UsersAreMemAccesses =
4872       llvm::all_of(I->users(), [&](User *U) -> bool {
4873         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4874       });
4875     if (UsersAreMemAccesses)
4876       addToWorklistIfAllowed(I);
4877   }
4878 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4882   unsigned idx = 0;
4883   while (idx != Worklist.size()) {
4884     Instruction *I = Worklist[idx++];
4885 
4886     for (auto OV : I->operand_values()) {
4887       // isOutOfScope operands cannot be uniform instructions.
4888       if (isOutOfScope(OV))
4889         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4892       auto *OP = dyn_cast<PHINode>(OV);
4893       if (OP && Legal->isFirstOrderRecurrence(OP))
4894         continue;
4895       // If all the users of the operand are uniform, then add the
4896       // operand into the uniform worklist.
4897       auto *OI = cast<Instruction>(OV);
4898       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4899             auto *J = cast<Instruction>(U);
4900             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4901           }))
4902         addToWorklistIfAllowed(OI);
4903     }
4904   }
4905 
4906   // For an instruction to be added into Worklist above, all its users inside
4907   // the loop should also be in Worklist. However, this condition cannot be
4908   // true for phi nodes that form a cyclic dependence. We must process phi
4909   // nodes separately. An induction variable will remain uniform if all users
4910   // of the induction variable and induction variable update remain uniform.
4911   // The code below handles both pointer and non-pointer induction variables.
4912   for (auto &Induction : Legal->getInductionVars()) {
4913     auto *Ind = Induction.first;
4914     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4915 
4916     // Determine if all users of the induction variable are uniform after
4917     // vectorization.
4918     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4919       auto *I = cast<Instruction>(U);
4920       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4921              isVectorizedMemAccessUse(I, Ind);
4922     });
4923     if (!UniformInd)
4924       continue;
4925 
4926     // Determine if all users of the induction variable update instruction are
4927     // uniform after vectorization.
4928     auto UniformIndUpdate =
4929         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4930           auto *I = cast<Instruction>(U);
4931           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4932                  isVectorizedMemAccessUse(I, IndUpdate);
4933         });
4934     if (!UniformIndUpdate)
4935       continue;
4936 
4937     // The induction variable and its update instruction will remain uniform.
4938     addToWorklistIfAllowed(Ind);
4939     addToWorklistIfAllowed(IndUpdate);
4940   }
4941 
4942   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4943 }
4944 
4945 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4946   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4947 
4948   if (Legal->getRuntimePointerChecking()->Need) {
4949     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4950         "runtime pointer checks needed. Enable vectorization of this "
4951         "loop with '#pragma clang loop vectorize(enable)' when "
4952         "compiling with -Os/-Oz",
4953         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4954     return true;
4955   }
4956 
4957   if (!PSE.getPredicate().isAlwaysTrue()) {
4958     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4959         "runtime SCEV checks needed. Enable vectorization of this "
4960         "loop with '#pragma clang loop vectorize(enable)' when "
4961         "compiling with -Os/-Oz",
4962         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4963     return true;
4964   }
4965 
4966   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4967   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4968     reportVectorizationFailure("Runtime stride check for small trip count",
4969         "runtime stride == 1 checks needed. Enable vectorization of "
4970         "this loop without such check by compiling with -Os/-Oz",
4971         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4972     return true;
4973   }
4974 
4975   return false;
4976 }
4977 
4978 ElementCount
4979 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4980   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4981     return ElementCount::getScalable(0);
4982 
4983   if (Hints->isScalableVectorizationDisabled()) {
4984     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4985                             "ScalableVectorizationDisabled", ORE, TheLoop);
4986     return ElementCount::getScalable(0);
4987   }
4988 
4989   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4990 
4991   auto MaxScalableVF = ElementCount::getScalable(
4992       std::numeric_limits<ElementCount::ScalarTy>::max());
4993 
4994   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4995   // FIXME: While for scalable vectors this is currently sufficient, this should
4996   // be replaced by a more detailed mechanism that filters out specific VFs,
4997   // instead of invalidating vectorization for a whole set of VFs based on the
4998   // MaxVF.
4999 
5000   // Disable scalable vectorization if the loop contains unsupported reductions.
5001   if (!canVectorizeReductions(MaxScalableVF)) {
5002     reportVectorizationInfo(
5003         "Scalable vectorization not supported for the reduction "
5004         "operations found in this loop.",
5005         "ScalableVFUnfeasible", ORE, TheLoop);
5006     return ElementCount::getScalable(0);
5007   }
5008 
5009   // Disable scalable vectorization if the loop contains any instructions
5010   // with element types not supported for scalable vectors.
5011   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5012         return !Ty->isVoidTy() &&
5013                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5014       })) {
5015     reportVectorizationInfo("Scalable vectorization is not supported "
5016                             "for all element types found in this loop.",
5017                             "ScalableVFUnfeasible", ORE, TheLoop);
5018     return ElementCount::getScalable(0);
5019   }
5020 
5021   if (Legal->isSafeForAnyVectorWidth())
5022     return MaxScalableVF;
5023 
5024   // Limit MaxScalableVF by the maximum safe dependence distance.
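  // For example, if the dependence analysis permits at most 32 elements per
  // iteration and the target's maximum vscale is 16, the largest safe
  // scalable VF is <vscale x 2>.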
5025   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5026   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5027     MaxVScale =
5028         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5029   MaxScalableVF = ElementCount::getScalable(
5030       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5031   if (!MaxScalableVF)
5032     reportVectorizationInfo(
5033         "Max legal vector width too small, scalable vectorization "
5034         "unfeasible.",
5035         "ScalableVFUnfeasible", ORE, TheLoop);
5036 
5037   return MaxScalableVF;
5038 }
5039 
5040 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5041     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5042   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5043   unsigned SmallestType, WidestType;
5044   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5045 
5046   // Get the maximum safe dependence distance in bits computed by LAA.
5047   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5049   // dependence distance).
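  // For example, if the smallest dependence distance permits 256 bits and the
  // widest type is 32 bits, MaxSafeElements = PowerOf2Floor(256 / 32) = 8.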
5050   unsigned MaxSafeElements =
5051       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
5052 
5053   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5054   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5055 
5056   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5057                     << ".\n");
5058   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5059                     << ".\n");
5060 
5061   // First analyze the UserVF, fall back if the UserVF should be ignored.
5062   if (UserVF) {
5063     auto MaxSafeUserVF =
5064         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5065 
5066     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5067       // If `VF=vscale x N` is safe, then so is `VF=N`
5068       if (UserVF.isScalable())
5069         return FixedScalableVFPair(
5070             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5071       else
5072         return UserVF;
5073     }
5074 
5075     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5076 
5077     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5078     // is better to ignore the hint and let the compiler choose a suitable VF.
5079     if (!UserVF.isScalable()) {
5080       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5081                         << " is unsafe, clamping to max safe VF="
5082                         << MaxSafeFixedVF << ".\n");
5083       ORE->emit([&]() {
5084         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5085                                           TheLoop->getStartLoc(),
5086                                           TheLoop->getHeader())
5087                << "User-specified vectorization factor "
5088                << ore::NV("UserVectorizationFactor", UserVF)
5089                << " is unsafe, clamping to maximum safe vectorization factor "
5090                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5091       });
5092       return MaxSafeFixedVF;
5093     }
5094 
5095     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5096       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5097                         << " is ignored because scalable vectors are not "
5098                            "available.\n");
5099       ORE->emit([&]() {
5100         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5101                                           TheLoop->getStartLoc(),
5102                                           TheLoop->getHeader())
5103                << "User-specified vectorization factor "
5104                << ore::NV("UserVectorizationFactor", UserVF)
5105                << " is ignored because the target does not support scalable "
5106                   "vectors. The compiler will pick a more suitable value.";
5107       });
5108     } else {
5109       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5110                         << " is unsafe. Ignoring scalable UserVF.\n");
5111       ORE->emit([&]() {
5112         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5113                                           TheLoop->getStartLoc(),
5114                                           TheLoop->getHeader())
5115                << "User-specified vectorization factor "
5116                << ore::NV("UserVectorizationFactor", UserVF)
5117                << " is unsafe. Ignoring the hint to let the compiler pick a "
5118                   "more suitable value.";
5119       });
5120     }
5121   }
5122 
5123   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5124                     << " / " << WidestType << " bits.\n");
5125 
5126   FixedScalableVFPair Result(ElementCount::getFixed(1),
5127                              ElementCount::getScalable(0));
5128   if (auto MaxVF =
5129           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5130                                   MaxSafeFixedVF, FoldTailByMasking))
5131     Result.FixedVF = MaxVF;
5132 
5133   if (auto MaxVF =
5134           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5135                                   MaxSafeScalableVF, FoldTailByMasking))
5136     if (MaxVF.isScalable()) {
5137       Result.ScalableVF = MaxVF;
5138       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5139                         << "\n");
5140     }
5141 
5142   return Result;
5143 }
5144 
5145 FixedScalableVFPair
5146 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5147   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the runtime check is
    // still likely to be dynamically uniform if the target can skip it.
5150     reportVectorizationFailure(
5151         "Not inserting runtime ptr check for divergent target",
5152         "runtime pointer checks needed. Not enabled for divergent target",
5153         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5154     return FixedScalableVFPair::getNone();
5155   }
5156 
5157   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5158   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5159   if (TC == 1) {
5160     reportVectorizationFailure("Single iteration (non) loop",
5161         "loop trip count is one, irrelevant for vectorization",
5162         "SingleIterationLoop", ORE, TheLoop);
5163     return FixedScalableVFPair::getNone();
5164   }
5165 
5166   switch (ScalarEpilogueStatus) {
5167   case CM_ScalarEpilogueAllowed:
5168     return computeFeasibleMaxVF(TC, UserVF, false);
5169   case CM_ScalarEpilogueNotAllowedUsePredicate:
5170     LLVM_FALLTHROUGH;
5171   case CM_ScalarEpilogueNotNeededUsePredicate:
5172     LLVM_DEBUG(
5173         dbgs() << "LV: vector predicate hint/switch found.\n"
5174                << "LV: Not allowing scalar epilogue, creating predicated "
5175                << "vector loop.\n");
5176     break;
5177   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5178     // fallthrough as a special case of OptForSize
5179   case CM_ScalarEpilogueNotAllowedOptSize:
5180     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5181       LLVM_DEBUG(
5182           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5183     else
5184       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5185                         << "count.\n");
5186 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
5189     if (runtimeChecksRequired())
5190       return FixedScalableVFPair::getNone();
5191 
5192     break;
5193   }
5194 
  // The only loops we can vectorize without a scalar epilogue are loops with
5196   // a bottom-test and a single exiting block. We'd have to handle the fact
5197   // that not every instruction executes on the last iteration.  This will
5198   // require a lane mask which varies through the vector loop body.  (TODO)
5199   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5200     // If there was a tail-folding hint/switch, but we can't fold the tail by
5201     // masking, fallback to a vectorization with a scalar epilogue.
5202     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5203       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5204                            "scalar epilogue instead.\n");
5205       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5206       return computeFeasibleMaxVF(TC, UserVF, false);
5207     }
5208     return FixedScalableVFPair::getNone();
5209   }
5210 
  // Now try tail folding.
5212 
5213   // Invalidate interleave groups that require an epilogue if we can't mask
5214   // the interleave-group.
5215   if (!useMaskedInterleavedAccesses(TTI)) {
5216     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5217            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5220     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5221   }
5222 
5223   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5224   // Avoid tail folding if the trip count is known to be a multiple of any VF
5225   // we chose.
5226   // FIXME: The condition below pessimises the case for fixed-width vectors,
5227   // when scalable VFs are also candidates for vectorization.
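  // For example, with a known trip count of 128, MaxFixedVF = 8 and
  // UserIC = 2, MaxVFtimesIC = 16 and 128 % 16 == 0, so no tail remains and
  // tail folding is unnecessary.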
5228   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5229     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5230     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5231            "MaxFixedVF must be a power of 2");
5232     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5233                                    : MaxFixedVF.getFixedValue();
5234     ScalarEvolution *SE = PSE.getSE();
5235     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5236     const SCEV *ExitCount = SE->getAddExpr(
5237         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5238     const SCEV *Rem = SE->getURemExpr(
5239         SE->applyLoopGuards(ExitCount, TheLoop),
5240         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5241     if (Rem->isZero()) {
5242       // Accept MaxFixedVF if we do not have a tail.
5243       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5244       return MaxFactors;
5245     }
5246   }
5247 
  // For scalable vectors, don't use tail folding for low trip counts or when
  // optimizing for code size. We only permit this if the user has explicitly
  // requested it.
5251   if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
5252       ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
5253       MaxFactors.ScalableVF.isVector())
5254     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5255 
5256   // If we don't know the precise trip count, or if the trip count that we
5257   // found modulo the vectorization factor is not zero, try to fold the tail
5258   // by masking.
5259   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5260   if (Legal->prepareToFoldTailByMasking()) {
5261     FoldTailByMasking = true;
5262     return MaxFactors;
5263   }
5264 
5265   // If there was a tail-folding hint/switch, but we can't fold the tail by
5266   // masking, fallback to a vectorization with a scalar epilogue.
5267   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5268     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5269                          "scalar epilogue instead.\n");
5270     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5271     return MaxFactors;
5272   }
5273 
5274   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5275     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5276     return FixedScalableVFPair::getNone();
5277   }
5278 
5279   if (TC == 0) {
5280     reportVectorizationFailure(
5281         "Unable to calculate the loop count due to complex control flow",
5282         "unable to calculate the loop count due to complex control flow",
5283         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5284     return FixedScalableVFPair::getNone();
5285   }
5286 
5287   reportVectorizationFailure(
5288       "Cannot optimize for size and vectorize at the same time.",
5289       "cannot optimize for size and vectorize at the same time. "
5290       "Enable vectorization of this loop with '#pragma clang loop "
5291       "vectorize(enable)' when compiling with -Os/-Oz",
5292       "NoTailLoopWithOptForSize", ORE, TheLoop);
5293   return FixedScalableVFPair::getNone();
5294 }
5295 
5296 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5297     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5298     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5299   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5300   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5301       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5302                            : TargetTransformInfo::RGK_FixedWidthVector);
5303 
5304   // Convenience function to return the minimum of two ElementCounts.
5305   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5306     assert((LHS.isScalable() == RHS.isScalable()) &&
5307            "Scalable flags must match");
5308     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5309   };
5310 
5311   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
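  // For example, with 128-bit vector registers and a widest type of 32 bits,
  // MaxVectorElementCount is 4 (or <vscale x 4> when computing a scalable VF).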
5313   auto MaxVectorElementCount = ElementCount::get(
5314       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5315       ComputeScalableMaxVF);
5316   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5317   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5318                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5319 
5320   if (!MaxVectorElementCount) {
5321     LLVM_DEBUG(dbgs() << "LV: The target has no "
5322                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5323                       << " vector registers.\n");
5324     return ElementCount::getFixed(1);
5325   }
5326 
5327   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5328   if (ConstTripCount &&
5329       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5330       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5331     // If loop trip count (TC) is known at compile time there is no point in
5332     // choosing VF greater than TC (as done in the loop below). Select maximum
5333     // power of two which doesn't exceed TC.
5334     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5335     // when the TC is less than or equal to the known number of lanes.
5336     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5337     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5338                          "exceeding the constant trip count: "
5339                       << ClampedConstTripCount << "\n");
5340     return ElementCount::getFixed(ClampedConstTripCount);
5341   }
5342 
5343   ElementCount MaxVF = MaxVectorElementCount;
5344   if (TTI.shouldMaximizeVectorBandwidth() ||
5345       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5346     auto MaxVectorElementCountMaxBW = ElementCount::get(
5347         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5348         ComputeScalableMaxVF);
5349     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5350 
5351     // Collect all viable vectorization factors larger than the default MaxVF
5352     // (i.e. MaxVectorElementCount).
5353     SmallVector<ElementCount, 8> VFs;
5354     for (ElementCount VS = MaxVectorElementCount * 2;
5355          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5356       VFs.push_back(VS);
5357 
5358     // For each VF calculate its register usage.
5359     auto RUs = calculateRegisterUsage(VFs);
5360 
5361     // Select the largest VF which doesn't require more registers than existing
5362     // ones.
5363     for (int i = RUs.size() - 1; i >= 0; --i) {
5364       bool Selected = true;
5365       for (auto &pair : RUs[i].MaxLocalUsers) {
5366         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5367         if (pair.second > TargetNumRegisters)
5368           Selected = false;
5369       }
5370       if (Selected) {
5371         MaxVF = VFs[i];
5372         break;
5373       }
5374     }
5375     if (ElementCount MinVF =
5376             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5377       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5378         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5379                           << ") with target's minimum: " << MinVF << '\n');
5380         MaxVF = MinVF;
5381       }
5382     }
5383   }
5384   return MaxVF;
5385 }
5386 
5387 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5388   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5389     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5390     auto Min = Attr.getVScaleRangeMin();
5391     auto Max = Attr.getVScaleRangeMax();
5392     if (Max && Min == Max)
5393       return Max;
5394   }
5395 
5396   return TTI.getVScaleForTuning();
5397 }
5398 
5399 bool LoopVectorizationCostModel::isMoreProfitable(
5400     const VectorizationFactor &A, const VectorizationFactor &B) const {
5401   InstructionCost CostA = A.Cost;
5402   InstructionCost CostB = B.Cost;
5403 
5404   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5405 
5406   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5407       MaxTripCount) {
5408     // If we are folding the tail and the trip count is a known (possibly small)
5409     // constant, the trip count will be rounded up to an integer number of
5410     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5411     // which we compare directly. When not folding the tail, the total cost will
5412     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5413     // approximated with the per-lane cost below instead of using the tripcount
5414     // as here.
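    // For example, with MaxTripCount = 10, comparing VF=4 at cost 8 against
    // VF=8 at cost 14 gives 8 * ceil(10/4) = 24 versus 14 * ceil(10/8) = 28,
    // so VF=4 is preferred here even though its per-lane cost is higher.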
5415     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5416     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5417     return RTCostA < RTCostB;
5418   }
5419 
5420   // Improve estimate for the vector width if it is scalable.
5421   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5422   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5423   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5424     if (A.Width.isScalable())
5425       EstimatedWidthA *= VScale.getValue();
5426     if (B.Width.isScalable())
5427       EstimatedWidthB *= VScale.getValue();
5428   }
5429 
5430   // Assume vscale may be larger than 1 (or the value being tuned for),
5431   // so that scalable vectorization is slightly favorable over fixed-width
5432   // vectorization.
5433   if (A.Width.isScalable() && !B.Width.isScalable())
5434     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5435 
5436   // To avoid the need for FP division:
5437   //      (CostA / A.Width) < (CostB / B.Width)
5438   // <=>  (CostA * B.Width) < (CostB * A.Width)
5439   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5440 }
5441 
5442 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5443     const ElementCountSet &VFCandidates) {
5444   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5445   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5446   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5447   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5448          "Expected Scalar VF to be a candidate");
5449 
5450   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5451   VectorizationFactor ChosenFactor = ScalarCost;
5452 
5453   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5454   if (ForceVectorization && VFCandidates.size() > 1) {
5455     // Ignore scalar width, because the user explicitly wants vectorization.
5456     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5457     // evaluation.
5458     ChosenFactor.Cost = InstructionCost::getMax();
5459   }
5460 
5461   SmallVector<InstructionVFPair> InvalidCosts;
5462   for (const auto &i : VFCandidates) {
5463     // The cost for scalar VF=1 is already calculated, so ignore it.
5464     if (i.isScalar())
5465       continue;
5466 
5467     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5468     VectorizationFactor Candidate(i, C.first);
5469 
5470 #ifndef NDEBUG
5471     unsigned AssumedMinimumVscale = 1;
5472     if (Optional<unsigned> VScale = getVScaleForTuning())
5473       AssumedMinimumVscale = VScale.getValue();
5474     unsigned Width =
5475         Candidate.Width.isScalable()
5476             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5477             : Candidate.Width.getFixedValue();
5478     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5479                       << " costs: " << (Candidate.Cost / Width));
5480     if (i.isScalable())
5481       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5482                         << AssumedMinimumVscale << ")");
5483     LLVM_DEBUG(dbgs() << ".\n");
5484 #endif
5485 
5486     if (!C.second && !ForceVectorization) {
5487       LLVM_DEBUG(
5488           dbgs() << "LV: Not considering vector loop of width " << i
5489                  << " because it will not generate any vector instructions.\n");
5490       continue;
5491     }
5492 
    // If profitable, add it to the ProfitableVFs list.
5494     if (isMoreProfitable(Candidate, ScalarCost))
5495       ProfitableVFs.push_back(Candidate);
5496 
5497     if (isMoreProfitable(Candidate, ChosenFactor))
5498       ChosenFactor = Candidate;
5499   }
5500 
5501   // Emit a report of VFs with invalid costs in the loop.
5502   if (!InvalidCosts.empty()) {
5503     // Group the remarks per instruction, keeping the instruction order from
5504     // InvalidCosts.
5505     std::map<Instruction *, unsigned> Numbering;
5506     unsigned I = 0;
5507     for (auto &Pair : InvalidCosts)
5508       if (!Numbering.count(Pair.first))
5509         Numbering[Pair.first] = I++;
5510 
5511     // Sort the list, first on instruction(number) then on VF.
5512     llvm::sort(InvalidCosts,
5513                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5514                  if (Numbering[A.first] != Numbering[B.first])
5515                    return Numbering[A.first] < Numbering[B.first];
5516                  ElementCountComparator ECC;
5517                  return ECC(A.second, B.second);
5518                });
5519 
5520     // For a list of ordered instruction-vf pairs:
5521     //   [(load, vf1), (load, vf2), (store, vf1)]
5522     // Group the instructions together to emit separate remarks for:
5523     //   load  (vf1, vf2)
5524     //   store (vf1)
5525     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5526     auto Subset = ArrayRef<InstructionVFPair>();
5527     do {
5528       if (Subset.empty())
5529         Subset = Tail.take_front(1);
5530 
5531       Instruction *I = Subset.front().first;
5532 
      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5538       if (Subset == Tail || Tail[Subset.size()].first != I) {
5539         std::string OutString;
5540         raw_string_ostream OS(OutString);
5541         assert(!Subset.empty() && "Unexpected empty range");
5542         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5543         for (auto &Pair : Subset)
5544           OS << (Pair.second == Subset.front().second ? "" : ", ")
5545              << Pair.second;
5546         OS << "):";
5547         if (auto *CI = dyn_cast<CallInst>(I))
5548           OS << " call to " << CI->getCalledFunction()->getName();
5549         else
5550           OS << " " << I->getOpcodeName();
5551         OS.flush();
5552         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5553         Tail = Tail.drop_front(Subset.size());
5554         Subset = {};
5555       } else
5556         // Grow the subset by one element
5557         Subset = Tail.take_front(Subset.size() + 1);
5558     } while (!Tail.empty());
5559   }
5560 
5561   if (!EnableCondStoresVectorization && NumPredStores) {
5562     reportVectorizationFailure("There are conditional stores.",
5563         "store that is conditionally executed prevents vectorization",
5564         "ConditionalStore", ORE, TheLoop);
5565     ChosenFactor = ScalarCost;
5566   }
5567 
5568   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5569                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5570              << "LV: Vectorization seems to be not beneficial, "
5571              << "but was forced by a user.\n");
5572   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5573   return ChosenFactor;
5574 }
5575 
5576 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5577     const Loop &L, ElementCount VF) const {
5578   // Cross iteration phis such as reductions need special handling and are
5579   // currently unsupported.
5580   if (any_of(L.getHeader()->phis(),
5581              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5582     return false;
5583 
5584   // Phis with uses outside of the loop require special handling and are
5585   // currently unsupported.
5586   for (auto &Entry : Legal->getInductionVars()) {
5587     // Look for uses of the value of the induction at the last iteration.
5588     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5589     for (User *U : PostInc->users())
5590       if (!L.contains(cast<Instruction>(U)))
5591         return false;
    // Look for uses of the penultimate value of the induction.
5593     for (User *U : Entry.first->users())
5594       if (!L.contains(cast<Instruction>(U)))
5595         return false;
5596   }
5597 
5598   // Induction variables that are widened require special handling that is
5599   // currently not supported.
5600   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5601         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5602                  this->isProfitableToScalarize(Entry.first, VF));
5603       }))
5604     return false;
5605 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5609   if (L.getExitingBlock() != L.getLoopLatch())
5610     return false;
5611 
5612   return true;
5613 }
5614 
5615 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5616     const ElementCount VF) const {
5617   // FIXME: We need a much better cost-model to take different parameters such
5618   // as register pressure, code size increase and cost of extra branches into
5619   // account. For now we apply a very crude heuristic and only consider loops
5620   // with vectorization factors larger than a certain value.
5621   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5623   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5624     return false;
5625   // FIXME: We should consider changing the threshold for scalable
5626   // vectors to take VScaleForTuning into account.
5627   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5628     return true;
5629   return false;
5630 }
5631 
5632 VectorizationFactor
5633 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5634     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5635   VectorizationFactor Result = VectorizationFactor::Disabled();
5636   if (!EnableEpilogueVectorization) {
5637     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5638     return Result;
5639   }
5640 
5641   if (!isScalarEpilogueAllowed()) {
5642     LLVM_DEBUG(
5643         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5644                   "allowed.\n";);
5645     return Result;
5646   }
5647 
5648   // Not really a cost consideration, but check for unsupported cases here to
5649   // simplify the logic.
5650   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5651     LLVM_DEBUG(
5652         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5653                   "not a supported candidate.\n";);
5654     return Result;
5655   }
5656 
5657   if (EpilogueVectorizationForceVF > 1) {
5658     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5659     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5660     if (LVP.hasPlanWithVF(ForcedEC))
5661       return {ForcedEC, 0};
5662     else {
5663       LLVM_DEBUG(
5664           dbgs()
5665               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5666       return Result;
5667     }
5668   }
5669 
5670   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5671       TheLoop->getHeader()->getParent()->hasMinSize()) {
5672     LLVM_DEBUG(
5673         dbgs()
5674             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5675     return Result;
5676   }
5677 
5678   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5679     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5680                          "this loop\n");
5681     return Result;
5682   }
5683 
5684   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5685   // the main loop handles 8 lanes per iteration. We could still benefit from
5686   // vectorizing the epilogue loop with VF=4.
5687   ElementCount EstimatedRuntimeVF = MainLoopVF;
5688   if (MainLoopVF.isScalable()) {
5689     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5690     if (Optional<unsigned> VScale = getVScaleForTuning())
5691       EstimatedRuntimeVF *= VScale.getValue();
5692   }
5693 
5694   for (auto &NextVF : ProfitableVFs)
5695     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5696           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5697          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5698         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5699         LVP.hasPlanWithVF(NextVF.Width))
5700       Result = NextVF;
5701 
5702   if (Result != VectorizationFactor::Disabled())
5703     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5704                       << Result.Width << "\n";);
5705   return Result;
5706 }
5707 
5708 std::pair<unsigned, unsigned>
5709 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5710   unsigned MinWidth = -1U;
5711   unsigned MaxWidth = 8;
5712   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5713   // For in-loop reductions, no element types are added to ElementTypesInLoop
5714   // if there are no loads/stores in the loop. In this case, check through the
5715   // reduction variables to determine the maximum width.
5716   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5717     // Reset MaxWidth so that we can find the smallest type used by recurrences
5718     // in the loop.
5719     MaxWidth = -1U;
5720     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5721       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5722       // When finding the min width used by the recurrence we need to account
5723       // for casts on the input operands of the recurrence.
5724       MaxWidth = std::min<unsigned>(
5725           MaxWidth, std::min<unsigned>(
5726                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5727                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5728     }
5729   } else {
5730     for (Type *T : ElementTypesInLoop) {
5731       MinWidth = std::min<unsigned>(
5732           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5733       MaxWidth = std::max<unsigned>(
5734           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5735     }
5736   }
5737   return {MinWidth, MaxWidth};
5738 }
5739 
5740 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5741   ElementTypesInLoop.clear();
5742   // For each block.
5743   for (BasicBlock *BB : TheLoop->blocks()) {
5744     // For each instruction in the loop.
5745     for (Instruction &I : BB->instructionsWithoutDebug()) {
5746       Type *T = I.getType();
5747 
5748       // Skip ignored values.
5749       if (ValuesToIgnore.count(&I))
5750         continue;
5751 
5752       // Only examine Loads, Stores and PHINodes.
5753       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5754         continue;
5755 
5756       // Examine PHI nodes that are reduction variables. Update the type to
5757       // account for the recurrence type.
5758       if (auto *PN = dyn_cast<PHINode>(&I)) {
5759         if (!Legal->isReductionVariable(PN))
5760           continue;
5761         const RecurrenceDescriptor &RdxDesc =
5762             Legal->getReductionVars().find(PN)->second;
5763         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5764             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5765                                       RdxDesc.getRecurrenceType(),
5766                                       TargetTransformInfo::ReductionFlags()))
5767           continue;
5768         T = RdxDesc.getRecurrenceType();
5769       }
5770 
5771       // Examine the stored values.
5772       if (auto *ST = dyn_cast<StoreInst>(&I))
5773         T = ST->getValueOperand()->getType();
5774 
5775       assert(T->isSized() &&
5776              "Expected the load/store/recurrence type to be sized");
5777 
5778       ElementTypesInLoop.insert(T);
5779     }
5780   }
5781 }
5782 
5783 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5784                                                            unsigned LoopCost) {
5785   // -- The interleave heuristics --
5786   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5787   // There are many micro-architectural considerations that we can't predict
5788   // at this level. For example, frontend pressure (on decode or fetch) due to
5789   // code size, or the number and capabilities of the execution ports.
5790   //
5791   // We use the following heuristics to select the interleave count:
5792   // 1. If the code has reductions, then we interleave to break the cross
5793   // iteration dependency.
5794   // 2. If the loop is really small, then we interleave to reduce the loop
5795   // overhead.
5796   // 3. We don't interleave if we think that we will spill registers to memory
5797   // due to the increased register pressure.
5798 
5799   if (!isScalarEpilogueAllowed())
5800     return 1;
5801 
  // If there is a finite maximum safe dependence distance, it already limits
  // the vectorization width; do not interleave.
5803   if (Legal->getMaxSafeDepDistBytes() != -1U)
5804     return 1;
5805 
5806   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5807   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under these conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5813   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5814       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5815     return 1;
5816 
5817   RegisterUsage R = calculateRegisterUsage({VF})[0];
5818   // We divide by these constants so assume that we have at least one
5819   // instruction that uses at least one register.
5820   for (auto& pair : R.MaxLocalUsers) {
5821     pair.second = std::max(pair.second, 1U);
5822   }
5823 
5824   // We calculate the interleave count using the following formula.
5825   // Subtract the number of loop invariants from the number of available
5826   // registers. These registers are used by all of the interleaved instances.
5827   // Next, divide the remaining registers by the number of registers that is
5828   // required by the loop, in order to estimate how many parallel instances
5829   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case
  // IC is set to 1 above.
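  // For example, with 32 available registers in a class, 2 loop-invariant
  // values and at most 5 values live at once, the estimate is
  // PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4 parallel instances.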
5835   unsigned IC = UINT_MAX;
5836 
5837   for (auto& pair : R.MaxLocalUsers) {
5838     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5839     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5840                       << " registers of "
5841                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5842     if (VF.isScalar()) {
5843       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5844         TargetNumRegisters = ForceTargetNumScalarRegs;
5845     } else {
5846       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5847         TargetNumRegisters = ForceTargetNumVectorRegs;
5848     }
5849     unsigned MaxLocalUsers = pair.second;
5850     unsigned LoopInvariantRegs = 0;
5851     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5852       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5853 
5854     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5855     // Don't count the induction variable as interleaved.
5856     if (EnableIndVarRegisterHeur) {
5857       TmpIC =
5858           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5859                         std::max(1U, (MaxLocalUsers - 1)));
5860     }
5861 
5862     IC = std::min(IC, TmpIC);
5863   }
5864 
5865   // Clamp the interleave ranges to reasonable counts.
5866   unsigned MaxInterleaveCount =
5867       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5868 
5869   // Check if the user has overridden the max.
5870   if (VF.isScalar()) {
5871     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5872       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5873   } else {
5874     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5875       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5876   }
5877 
  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be at most the trip count divided by VF, ensuring it
  // is at least 1.
5881   //
5882   // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
5885   // similar benefit as for fixed-width vectors. For now, we choose to leave
5886   // the InterleaveCount as if vscale is '1', although if some information about
5887   // the vector is known (e.g. min vector size), we can make a better decision.
5888   if (BestKnownTC) {
5889     MaxInterleaveCount =
5890         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5891     // Make sure MaxInterleaveCount is greater than 0.
5892     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5893   }
5894 
5895   assert(MaxInterleaveCount > 0 &&
5896          "Maximum interleave count must be greater than 0");
5897 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5900   if (IC > MaxInterleaveCount)
5901     IC = MaxInterleaveCount;
5902   else
5903     // Make sure IC is greater than 0.
5904     IC = std::max(1u, IC);
5905 
5906   assert(IC > 0 && "Interleave count must be greater than 0.");
5907 
5908   // If we did not calculate the cost for VF (because the user selected the VF)
5909   // then we calculate the cost of VF here.
5910   if (LoopCost == 0) {
5911     InstructionCost C = expectedCost(VF).first;
5912     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5913     LoopCost = *C.getValue();
5914   }
5915 
5916   assert(LoopCost && "Non-zero loop cost expected");
5917 
5918   // Interleave if we vectorized this loop and there is a reduction that could
5919   // benefit from interleaving.
5920   if (VF.isVector() && HasReductions) {
5921     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5922     return IC;
5923   }
5924 
5925   // For any scalar loop that either requires runtime checks or predication we
5926   // are better off leaving this to the unroller. Note that if we've already
5927   // vectorized the loop we will have done the runtime check and so interleaving
5928   // won't require further checks.
5929   bool ScalarInterleavingRequiresPredication =
5930       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5931          return Legal->blockNeedsPredication(BB);
5932        }));
5933   bool ScalarInterleavingRequiresRuntimePointerCheck =
5934       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5935 
5936   // We want to interleave small loops in order to reduce the loop overhead and
5937   // potentially expose ILP opportunities.
5938   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5939                     << "LV: IC is " << IC << '\n'
5940                     << "LV: VF is " << VF << '\n');
5941   const bool AggressivelyInterleaveReductions =
5942       TTI.enableAggressiveInterleaving(HasReductions);
5943   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5944       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the per-iteration loop overhead costs 1. Using the cost
    // model's estimate of the loop body cost, we interleave until the loop
    // overhead is about 5% of the cost of the loop.
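    // For example, if SmallLoopCost is 20 and the loop body costs 4, we allow
    // up to min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4) interleaved copies.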
5948     unsigned SmallIC =
5949         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5950 
5951     // Interleave until store/load ports (estimated by max interleave count) are
5952     // saturated.
5953     unsigned NumStores = Legal->getNumStores();
5954     unsigned NumLoads = Legal->getNumLoads();
5955     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5956     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
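    // For example, with IC = 8, two stores and one load, StoresIC = 8 / 2 = 4
    // and LoadsIC = 8 / 1 = 8.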
5957 
5958     // There is little point in interleaving for reductions containing selects
5959     // and compares when VF=1 since it may just create more overhead than it's
5960     // worth for loops with small trip counts. This is because we still have to
5961     // do the final reduction after the loop.
5962     bool HasSelectCmpReductions =
5963         HasReductions &&
5964         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5965           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5966           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5967               RdxDesc.getRecurrenceKind());
5968         });
5969     if (HasSelectCmpReductions) {
5970       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5971       return 1;
5972     }
5973 
5974     // If we have a scalar reduction (vector reductions are already dealt with
5975     // by this point), we can increase the critical path length if the loop
5976     // we're interleaving is inside another loop. For tree-wise reductions
5977     // set the limit to 2, and for ordered reductions it's best to disable
5978     // interleaving entirely.
5979     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5980       bool HasOrderedReductions =
5981           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5982             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5983             return RdxDesc.isOrdered();
5984           });
5985       if (HasOrderedReductions) {
5986         LLVM_DEBUG(
5987             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5988         return 1;
5989       }
5990 
5991       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5992       SmallIC = std::min(SmallIC, F);
5993       StoresIC = std::min(StoresIC, F);
5994       LoadsIC = std::min(LoadsIC, F);
5995     }
5996 
5997     if (EnableLoadStoreRuntimeInterleave &&
5998         std::max(StoresIC, LoadsIC) > SmallIC) {
5999       LLVM_DEBUG(
6000           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6001       return std::max(StoresIC, LoadsIC);
6002     }
6003 
6004     // If there are scalar reductions and TTI has enabled aggressive
6005     // interleaving for reductions, we will interleave to expose ILP.
6006     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6007         AggressivelyInterleaveReductions) {
6008       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to cope with the rare situation where resources are too limited.
6011       return std::max(IC / 2, SmallIC);
6012     } else {
6013       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6014       return SmallIC;
6015     }
6016   }
6017 
6018   // Interleave if this is a large loop (small loops are already dealt with by
6019   // this point) that could benefit from interleaving.
6020   if (AggressivelyInterleaveReductions) {
6021     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6022     return IC;
6023   }
6024 
6025   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6026   return 1;
6027 }
6028 
6029 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6030 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6031   // This function calculates the register usage by measuring the highest number
6032   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are visited
  // before their users. We assume that each instruction that has in-loop
6036   // users starts an interval. We record every time that an in-loop value is
6037   // used, so we have a list of the first and last occurrences of each
6038   // instruction. Next, we transpose this data structure into a multi map that
6039   // holds the list of intervals that *end* at a specific location. This multi
6040   // map allows us to perform a linear search. We scan the instructions linearly
6041   // and record each time that a new interval starts, by placing it in a set.
6042   // If we find this value in the multi-map then we remove it from the set.
6043   // The max register usage is the maximum size of the set.
6044   // We also search for instructions that are defined outside the loop, but are
6045   // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // up more registers.
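  // Note that values with no in-loop users (e.g. stores) never open an
  // interval, and debug intrinsics are skipped entirely.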
6048   LoopBlocksDFS DFS(TheLoop);
6049   DFS.perform(LI);
6050 
6051   RegisterUsage RU;
6052 
6053   // Each 'key' in the map opens a new interval. The values
6054   // of the map are the index of the 'last seen' usage of the
6055   // instruction that is the key.
6056   using IntervalMap = DenseMap<Instruction *, unsigned>;
6057 
6058   // Maps instruction to its index.
6059   SmallVector<Instruction *, 64> IdxToInstr;
6060   // Marks the end of each interval.
6061   IntervalMap EndPoint;
  // Saves the instructions that have uses inside the loop.
6063   SmallPtrSet<Instruction *, 8> Ends;
  // Saves the values that are used in the loop but are defined outside the
  // loop. Only out-of-loop instructions end up here; arguments and constants
  // are skipped when collecting operands below.
6066   SmallPtrSet<Value *, 8> LoopInvariants;
6067 
6068   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6069     for (Instruction &I : BB->instructionsWithoutDebug()) {
6070       IdxToInstr.push_back(&I);
6071 
6072       // Save the end location of each USE.
6073       for (Value *U : I.operands()) {
6074         auto *Instr = dyn_cast<Instruction>(U);
6075 
6076         // Ignore non-instruction values such as arguments, constants, etc.
6077         if (!Instr)
6078           continue;
6079 
6080         // If this instruction is outside the loop then record it and continue.
6081         if (!TheLoop->contains(Instr)) {
6082           LoopInvariants.insert(Instr);
6083           continue;
6084         }
6085 
6086         // Overwrite previous end points.
6087         EndPoint[Instr] = IdxToInstr.size();
6088         Ends.insert(Instr);
6089       }
6090     }
6091   }
6092 
6093   // Saves the list of intervals that end with the index in 'key'.
6094   using InstrList = SmallVector<Instruction *, 2>;
6095   DenseMap<unsigned, InstrList> TransposeEnds;
6096 
6097   // Transpose the EndPoints to a list of values that end at each index.
6098   for (auto &Interval : EndPoint)
6099     TransposeEnds[Interval.second].push_back(Interval.first);
6100 
6101   SmallPtrSet<Instruction *, 8> OpenIntervals;
6102   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6103   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
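  // MaxUsages[j] maps a register class ID to the maximum number of values of
  // that class that were simultaneously live for VFs[j].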
6104 
6105   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6106 
6107   // A lambda that gets the register usage for the given type and VF.
6108   const auto &TTICapture = TTI;
6109   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6110     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6111       return 0;
6112     InstructionCost::CostType RegUsage =
6113         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6114     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6115            "Nonsensical values for register usage.");
6116     return RegUsage;
6117   };
6118 
6119   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6120     Instruction *I = IdxToInstr[i];
6121 
6122     // Remove all of the instructions that end at this location.
6123     InstrList &List = TransposeEnds[i];
6124     for (Instruction *ToRemove : List)
6125       OpenIntervals.erase(ToRemove);
6126 
6127     // Ignore instructions that are never used within the loop.
6128     if (!Ends.count(I))
6129       continue;
6130 
6131     // Skip ignored values.
6132     if (ValuesToIgnore.count(I))
6133       continue;
6134 
6135     // For each VF find the maximum usage of registers.
6136     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6137       // Count the number of live intervals.
6138       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6139 
      if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }
6169 
      for (const auto &Pair : RegUsage)
        MaxUsages[j][Pair.first] =
            std::max(MaxUsages[j][Pair.first], Pair.second);
6176     }
6177 
6178     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6179                       << OpenIntervals.size() << '\n');
6180 
6181     // Add the current instruction to the list of open intervals.
6182     OpenIntervals.insert(I);
6183   }
6184 
6185   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6186     SmallMapVector<unsigned, unsigned, 4> Invariant;
6187 
6188     for (auto Inst : LoopInvariants) {
6189       unsigned Usage =
6190           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6191       unsigned ClassID =
6192           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6193       if (Invariant.find(ClassID) == Invariant.end())
6194         Invariant[ClassID] = Usage;
6195       else
6196         Invariant[ClassID] += Usage;
6197     }
6198 
6199     LLVM_DEBUG({
6200       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6201       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6202              << " item\n";
6203       for (const auto &pair : MaxUsages[i]) {
6204         dbgs() << "LV(REG): RegisterClass: "
6205                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6206                << " registers\n";
6207       }
6208       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6209              << " item\n";
6210       for (const auto &pair : Invariant) {
6211         dbgs() << "LV(REG): RegisterClass: "
6212                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6213                << " registers\n";
6214       }
6215     });
6216 
6217     RU.LoopInvariantRegs = Invariant;
6218     RU.MaxLocalUsers = MaxUsages[i];
6219     RUs[i] = RU;
6220   }
6221 
6222   return RUs;
6223 }
6224 
6225 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6226                                                            ElementCount VF) {
6227   // TODO: Cost model for emulated masked load/store is completely
6228   // broken. This hack guides the cost model to use an artificially
6229   // high enough value to practically disable vectorization with such
6230   // operations, except where previously deployed legality hack allowed
6231   // using very low cost values. This is to avoid regressions coming simply
6232   // from moving "masked load/store" check from legality to cost model.
  // Emulation of masked loads/gathers was previously never allowed.
  // Emulation of a limited number of masked stores/scatters was allowed.
6235   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6236   return isa<LoadInst>(I) ||
6237          (isa<StoreInst>(I) &&
6238           NumPredStores > NumberOfStoresToPredicate);
6239 }
6240 
6241 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6242   // If we aren't vectorizing the loop, or if we've already collected the
6243   // instructions to scalarize, there's nothing to do. Collection may already
6244   // have occurred if we have a user-selected VF and are now computing the
6245   // expected cost for interleaving.
6246   if (VF.isScalar() || VF.isZero() ||
6247       InstsToScalarize.find(VF) != InstsToScalarize.end())
6248     return;
6249 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6251   // not profitable to scalarize any instructions, the presence of VF in the
6252   // map will indicate that we've analyzed it already.
6253   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6254 
6255   // Find all the instructions that are scalar with predication in the loop and
6256   // determine if it would be better to not if-convert the blocks they are in.
6257   // If so, we also record the instructions to scalarize.
6258   for (BasicBlock *BB : TheLoop->blocks()) {
6259     if (!blockNeedsPredicationForAnyReason(BB))
6260       continue;
6261     for (Instruction &I : *BB)
6262       if (isScalarWithPredication(&I, VF)) {
6263         ScalarCostsTy ScalarCosts;
6264         // Do not apply discount if scalable, because that would lead to
6265         // invalid scalarization costs.
6266         // Do not apply discount logic if hacked cost is needed
6267         // for emulated masked memrefs.
6268         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6269             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6270           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6271         // Remember that BB will remain after vectorization.
6272         PredicatedBBsAfterVectorization.insert(BB);
6273       }
6274   }
6275 }
6276 
6277 int LoopVectorizationCostModel::computePredInstDiscount(
6278     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6279   assert(!isUniformAfterVectorization(PredInst, VF) &&
6280          "Instruction marked uniform-after-vectorization will be predicated");
6281 
6282   // Initialize the discount to zero, meaning that the scalar version and the
6283   // vector version cost the same.
6284   InstructionCost Discount = 0;
6285 
6286   // Holds instructions to analyze. The instructions we visit are mapped in
6287   // ScalarCosts. Those instructions are the ones that would be scalarized if
6288   // we find that the scalar version costs less.
6289   SmallVector<Instruction *, 8> Worklist;
6290 
6291   // Returns true if the given instruction can be scalarized.
6292   auto canBeScalarized = [&](Instruction *I) -> bool {
6293     // We only attempt to scalarize instructions forming a single-use chain
6294     // from the original predicated block that would otherwise be vectorized.
6295     // Although not strictly necessary, we give up on instructions we know will
6296     // already be scalar to avoid traversing chains that are unlikely to be
6297     // beneficial.
6298     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6299         isScalarAfterVectorization(I, VF))
6300       return false;
6301 
6302     // If the instruction is scalar with predication, it will be analyzed
6303     // separately. We ignore it within the context of PredInst.
6304     if (isScalarWithPredication(I, VF))
6305       return false;
6306 
6307     // If any of the instruction's operands are uniform after vectorization,
6308     // the instruction cannot be scalarized. This prevents, for example, a
6309     // masked load from being scalarized.
6310     //
6311     // We assume we will only emit a value for lane zero of an instruction
6312     // marked uniform after vectorization, rather than VF identical values.
6313     // Thus, if we scalarize an instruction that uses a uniform, we would
6314     // create uses of values corresponding to the lanes we aren't emitting code
6315     // for. This behavior can be changed by allowing getScalarValue to clone
6316     // the lane zero values for uniforms rather than asserting.
6317     for (Use &U : I->operands())
6318       if (auto *J = dyn_cast<Instruction>(U.get()))
6319         if (isUniformAfterVectorization(J, VF))
6320           return false;
6321 
6322     // Otherwise, we can scalarize the instruction.
6323     return true;
6324   };
6325 
6326   // Compute the expected cost discount from scalarizing the entire expression
6327   // feeding the predicated instruction. We currently only consider expressions
6328   // that are single-use instruction chains.
6329   Worklist.push_back(PredInst);
6330   while (!Worklist.empty()) {
6331     Instruction *I = Worklist.pop_back_val();
6332 
6333     // If we've already analyzed the instruction, there's nothing to do.
6334     if (ScalarCosts.find(I) != ScalarCosts.end())
6335       continue;
6336 
6337     // Compute the cost of the vector instruction. Note that this cost already
6338     // includes the scalarization overhead of the predicated instruction.
6339     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6340 
6341     // Compute the cost of the scalarized instruction. This cost is the cost of
6342     // the instruction as if it wasn't if-converted and instead remained in the
6343     // predicated block. We will scale this cost by block probability after
6344     // computing the scalarization overhead.
6345     InstructionCost ScalarCost =
6346         VF.getFixedValue() *
6347         getInstructionCost(I, ElementCount::getFixed(1)).first;
6348 
6349     // Compute the scalarization overhead of needed insertelement instructions
6350     // and phi nodes.
6351     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6352       ScalarCost += TTI.getScalarizationOverhead(
6353           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6354           APInt::getAllOnes(VF.getFixedValue()), true, false);
6355       ScalarCost +=
6356           VF.getFixedValue() *
6357           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6358     }
6359 
6360     // Compute the scalarization overhead of needed extractelement
6361     // instructions. For each of the instruction's operands, if the operand can
6362     // be scalarized, add it to the worklist; otherwise, account for the
6363     // overhead.
6364     for (Use &U : I->operands())
6365       if (auto *J = dyn_cast<Instruction>(U.get())) {
6366         assert(VectorType::isValidElementType(J->getType()) &&
6367                "Instruction has non-scalar type");
6368         if (canBeScalarized(J))
6369           Worklist.push_back(J);
6370         else if (needsExtract(J, VF)) {
6371           ScalarCost += TTI.getScalarizationOverhead(
6372               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6373               APInt::getAllOnes(VF.getFixedValue()), false, true);
6374         }
6375       }
6376 
6377     // Scale the total scalar cost by block probability.
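    // getReciprocalPredBlockProb() returns the reciprocal of the assumed
    // execution probability, so e.g. a value of 2 models a block that runs on
    // roughly half of the iterations.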
6378     ScalarCost /= getReciprocalPredBlockProb();
6379 
6380     // Compute the discount. A non-negative discount means the vector version
6381     // of the instruction costs more, and scalarizing would be beneficial.
6382     Discount += VectorCost - ScalarCost;
6383     ScalarCosts[I] = ScalarCost;
6384   }
6385 
6386   return *Discount.getValue();
6387 }
6388 
6389 LoopVectorizationCostModel::VectorizationCostTy
6390 LoopVectorizationCostModel::expectedCost(
6391     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6392   VectorizationCostTy Cost;
6393 
6394   // For each block.
6395   for (BasicBlock *BB : TheLoop->blocks()) {
6396     VectorizationCostTy BlockCost;
6397 
6398     // For each instruction in the old loop.
6399     for (Instruction &I : BB->instructionsWithoutDebug()) {
6400       // Skip ignored values.
6401       if (ValuesToIgnore.count(&I) ||
6402           (VF.isVector() && VecValuesToIgnore.count(&I)))
6403         continue;
6404 
6405       VectorizationCostTy C = getInstructionCost(&I, VF);
6406 
6407       // Check if we should override the cost.
6408       if (C.first.isValid() &&
6409           ForceTargetInstructionCost.getNumOccurrences() > 0)
6410         C.first = InstructionCost(ForceTargetInstructionCost);
6411 
6412       // Keep a list of instructions with invalid costs.
6413       if (Invalid && !C.first.isValid())
6414         Invalid->emplace_back(&I, VF);
6415 
6416       BlockCost.first += C.first;
6417       BlockCost.second |= C.second;
6418       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6419                         << " for VF " << VF << " For instruction: " << I
6420                         << '\n');
6421     }
6422 
6423     // If we are vectorizing a predicated block, it will have been
6424     // if-converted. This means that the block's instructions (aside from
6425     // stores and instructions that may divide by zero) will now be
6426     // unconditionally executed. For the scalar case, we may not always execute
6427     // the predicated block, if it is an if-else block. Thus, scale the block's
6428     // cost by the probability of executing it. blockNeedsPredication from
6429     // Legal is used so as to not include all blocks in tail folded loops.
6430     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6431       BlockCost.first /= getReciprocalPredBlockProb();
6432 
6433     Cost.first += BlockCost.first;
6434     Cost.second |= BlockCost.second;
6435   }
6436 
6437   return Cost;
6438 }
6439 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop-invariant except for the induction variable dependence.
6442 ///
6443 /// This SCEV can be sent to the Target in order to estimate the address
6444 /// calculation cost.
static const SCEV *
getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal,
                     PredicatedScalarEvolution &PSE, const Loop *TheLoop) {
6451   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6452   if (!Gep)
6453     return nullptr;
6454 
6455   // We are looking for a gep with all loop invariant indices except for one
6456   // which should be an induction variable.
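  // For example, a GEP such as 'getelementptr double, double* %base, i64 %iv',
  // where %iv is an induction variable, matches this pattern.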
6457   auto SE = PSE.getSE();
6458   unsigned NumOperands = Gep->getNumOperands();
6459   for (unsigned i = 1; i < NumOperands; ++i) {
6460     Value *Opd = Gep->getOperand(i);
6461     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6462         !Legal->isInductionVariable(Opd))
6463       return nullptr;
6464   }
6465 
  // Now we know we have a GEP like (ptr, %inv, %ind, %inv). Return its SCEV.
6467   return PSE.getSCEV(Ptr);
6468 }
6469 
6470 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6471   return Legal->hasStride(I->getOperand(0)) ||
6472          Legal->hasStride(I->getOperand(1));
6473 }
6474 
6475 InstructionCost
6476 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6477                                                         ElementCount VF) {
6478   assert(VF.isVector() &&
6479          "Scalarization cost of instruction implies vectorization.");
6480   if (VF.isScalable())
6481     return InstructionCost::getInvalid();
6482 
6483   Type *ValTy = getLoadStoreType(I);
6484   auto SE = PSE.getSE();
6485 
6486   unsigned AS = getLoadStoreAddressSpace(I);
6487   Value *Ptr = getLoadStorePointerOperand(I);
6488   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6489   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6490   //       that it is being called from this specific place.
6491 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6494   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6495 
6496   // Get the cost of the scalar memory instruction and address computation.
6497   InstructionCost Cost =
6498       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6499 
6500   // Don't pass *I here, since it is scalar but will actually be part of a
6501   // vectorized loop where the user of it is a vectorized instruction.
6502   const Align Alignment = getLoadStoreAlignment(I);
6503   Cost += VF.getKnownMinValue() *
6504           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6505                               AS, TTI::TCK_RecipThroughput);
6506 
6507   // Get the overhead of the extractelement and insertelement instructions
6508   // we might create due to scalarization.
6509   Cost += getScalarizationOverhead(I, VF);
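  // At this point Cost covers VF scalar address computations, VF scalar
  // memory operations, and the insert/extract overhead of scalarization.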
6510 
6511   // If we have a predicated load/store, it will need extra i1 extracts and
6512   // conditional branches, but may not be executed for each vector lane. Scale
6513   // the cost by the probability of executing the predicated block.
6514   if (isPredicatedInst(I, VF)) {
6515     Cost /= getReciprocalPredBlockProb();
6516 
6517     // Add the cost of an i1 extract and a branch
6518     auto *Vec_i1Ty =
6519         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6520     Cost += TTI.getScalarizationOverhead(
6521         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6522         /*Insert=*/false, /*Extract=*/true);
6523     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6524 
6525     if (useEmulatedMaskMemRefHack(I, VF))
6526       // Artificially setting to a high enough value to practically disable
6527       // vectorization with such operations.
6528       Cost = 3000000;
6529   }
6530 
6531   return Cost;
6532 }
6533 
6534 InstructionCost
6535 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6536                                                     ElementCount VF) {
6537   Type *ValTy = getLoadStoreType(I);
6538   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6539   Value *Ptr = getLoadStorePointerOperand(I);
6540   unsigned AS = getLoadStoreAddressSpace(I);
6541   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6542   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6543 
6544   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6545          "Stride should be 1 or -1 for consecutive memory access");
6546   const Align Alignment = getLoadStoreAlignment(I);
6547   InstructionCost Cost = 0;
6548   if (Legal->isMaskRequired(I))
6549     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6550                                       CostKind);
6551   else
6552     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6553                                 CostKind, I);
6554 
6555   bool Reverse = ConsecutiveStride < 0;
6556   if (Reverse)
6557     Cost +=
6558         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6559   return Cost;
6560 }
6561 
6562 InstructionCost
6563 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6564                                                 ElementCount VF) {
6565   assert(Legal->isUniformMemOp(*I));
6566 
6567   Type *ValTy = getLoadStoreType(I);
6568   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6569   const Align Alignment = getLoadStoreAlignment(I);
6570   unsigned AS = getLoadStoreAddressSpace(I);
6571   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6572   if (isa<LoadInst>(I)) {
6573     return TTI.getAddressComputationCost(ValTy) +
6574            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6575                                CostKind) +
6576            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6577   }
6578   StoreInst *SI = cast<StoreInst>(I);
6579 
6580   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
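  // If the stored value is loop-invariant no extract is needed; otherwise the
  // last lane (index VF - 1) must first be extracted from the vector before
  // the scalar store, which is the extract cost added below.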
6581   return TTI.getAddressComputationCost(ValTy) +
6582          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6583                              CostKind) +
6584          (isLoopInvariantStoreValue
6585               ? 0
6586               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6587                                        VF.getKnownMinValue() - 1));
6588 }
6589 
6590 InstructionCost
6591 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6592                                                  ElementCount VF) {
6593   Type *ValTy = getLoadStoreType(I);
6594   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6595   const Align Alignment = getLoadStoreAlignment(I);
6596   const Value *Ptr = getLoadStorePointerOperand(I);
6597 
6598   return TTI.getAddressComputationCost(VectorTy) +
6599          TTI.getGatherScatterOpCost(
6600              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6601              TargetTransformInfo::TCK_RecipThroughput, I);
6602 }
6603 
6604 InstructionCost
6605 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6606                                                    ElementCount VF) {
6607   // TODO: Once we have support for interleaving with scalable vectors
6608   // we can calculate the cost properly here.
6609   if (VF.isScalable())
6610     return InstructionCost::getInvalid();
6611 
6612   Type *ValTy = getLoadStoreType(I);
6613   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6614   unsigned AS = getLoadStoreAddressSpace(I);
6615 
6616   auto Group = getInterleavedAccessGroup(I);
6617   assert(Group && "Fail to get an interleaved access group.");
6618 
6619   unsigned InterleaveFactor = Group->getFactor();
6620   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6621 
6622   // Holds the indices of existing members in the interleaved group.
6623   SmallVector<unsigned, 4> Indices;
6624   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6625     if (Group->getMember(IF))
6626       Indices.push_back(IF);
6627 
6628   // Calculate the cost of the whole interleaved group.
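  // A mask for gaps is needed if the group requires a scalar epilogue that is
  // not allowed, or if this is a store group with missing members.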
6629   bool UseMaskForGaps =
6630       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6631       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6632   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6633       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6634       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6635 
6636   if (Group->isReverse()) {
6637     // TODO: Add support for reversed masked interleaved access.
6638     assert(!Legal->isMaskRequired(I) &&
6639            "Reverse masked interleaved access not supported.");
6640     Cost +=
6641         Group->getNumMembers() *
6642         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6643   }
6644   return Cost;
6645 }
6646 
6647 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6648     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6649   using namespace llvm::PatternMatch;
6650   // Early exit for no inloop reductions
6651   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6652     return None;
6653   auto *VectorTy = cast<VectorType>(Ty);
6654 
  // We are looking for one of the following patterns, computing the minimal
  // acceptable cost for it:
6656   //  reduce(mul(ext(A), ext(B))) or
6657   //  reduce(mul(A, B)) or
6658   //  reduce(ext(A)) or
6659   //  reduce(A).
6660   // The basic idea is that we walk down the tree to do that, finding the root
6661   // reduction instruction in InLoopReductionImmediateChains. From there we find
6662   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return None so that the original cost model is used.
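  // The motivation is that some targets can lower a pattern such as
  // reduce(mul(ext(A), ext(B))) to a single extending multiply-accumulate
  // reduction, which is modelled via getExtendedAddReductionCost below.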
6667   Instruction *RetI = I;
6668   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6669     if (!RetI->hasOneUser())
6670       return None;
6671     RetI = RetI->user_back();
6672   }
6673   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6674       RetI->user_back()->getOpcode() == Instruction::Add) {
6675     if (!RetI->hasOneUser())
6676       return None;
6677     RetI = RetI->user_back();
6678   }
6679 
  // Test if the found instruction is a reduction, and if not return None so
  // that the caller falls back to the original cost modelling.
6682   if (!InLoopReductionImmediateChains.count(RetI))
6683     return None;
6684 
6685   // Find the reduction this chain is a part of and calculate the basic cost of
6686   // the reduction on its own.
6687   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6688   Instruction *ReductionPhi = LastChain;
6689   while (!isa<PHINode>(ReductionPhi))
6690     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6691 
6692   const RecurrenceDescriptor &RdxDesc =
6693       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6694 
6695   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6696       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6697 
6698   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6699   // normal fmul instruction to the cost of the fadd reduction.
6700   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6701     BaseCost +=
6702         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6703 
6704   // If we're using ordered reductions then we can just return the base cost
6705   // here, since getArithmeticReductionCost calculates the full ordered
6706   // reduction cost when FP reassociation is not allowed.
6707   if (useOrderedReductions(RdxDesc))
6708     return BaseCost;
6709 
6710   // Get the operand that was not the reduction chain and match it to one of the
6711   // patterns, returning the better cost if it is found.
6712   Instruction *RedOp = RetI->getOperand(1) == LastChain
6713                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6714                            : dyn_cast<Instruction>(RetI->getOperand(1));
6715 
6716   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6717 
6718   Instruction *Op0, *Op1;
6719   if (RedOp &&
6720       match(RedOp,
6721             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6722       match(Op0, m_ZExtOrSExt(m_Value())) &&
6723       Op0->getOpcode() == Op1->getOpcode() &&
6724       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6725       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6726       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6727 
6728     // Matched reduce(ext(mul(ext(A), ext(B)))
6729     // Note that the extend opcodes need to all match, or if A==B they will have
6730     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6731     // which is equally fine.
6732     bool IsUnsigned = isa<ZExtInst>(Op0);
6733     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6734     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6735 
6736     InstructionCost ExtCost =
6737         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6738                              TTI::CastContextHint::None, CostKind, Op0);
6739     InstructionCost MulCost =
6740         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6741     InstructionCost Ext2Cost =
6742         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6743                              TTI::CastContextHint::None, CostKind, RedOp);
6744 
6745     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6746         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6747         CostKind);
6748 
6749     if (RedCost.isValid() &&
6750         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6751       return I == RetI ? RedCost : 0;
6752   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6753              !TheLoop->isLoopInvariant(RedOp)) {
6754     // Matched reduce(ext(A))
6755     bool IsUnsigned = isa<ZExtInst>(RedOp);
6756     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6757     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6758         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6759         CostKind);
6760 
6761     InstructionCost ExtCost =
6762         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6763                              TTI::CastContextHint::None, CostKind, RedOp);
6764     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6765       return I == RetI ? RedCost : 0;
6766   } else if (RedOp &&
6767              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6768     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6769         Op0->getOpcode() == Op1->getOpcode() &&
6770         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6771       bool IsUnsigned = isa<ZExtInst>(Op0);
6772       Type *Op0Ty = Op0->getOperand(0)->getType();
6773       Type *Op1Ty = Op1->getOperand(0)->getType();
6774       Type *LargestOpTy =
6775           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6776                                                                     : Op0Ty;
6777       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6778 
6779       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6780       // different sizes. We take the largest type as the ext to reduce, and add
6781       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6782       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6783           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6784           TTI::CastContextHint::None, CostKind, Op0);
6785       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6786           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6787           TTI::CastContextHint::None, CostKind, Op1);
6788       InstructionCost MulCost =
6789           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6790 
6791       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6792           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6793           CostKind);
6794       InstructionCost ExtraExtCost = 0;
6795       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6796         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6797         ExtraExtCost = TTI.getCastInstrCost(
6798             ExtraExtOp->getOpcode(), ExtType,
6799             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6800             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6801       }
6802 
6803       if (RedCost.isValid() &&
6804           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6805         return I == RetI ? RedCost : 0;
6806     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6807       // Matched reduce(mul())
6808       InstructionCost MulCost =
6809           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6810 
6811       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6812           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6813           CostKind);
6814 
6815       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6816         return I == RetI ? RedCost : 0;
6817     }
6818   }
6819 
6820   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6821 }
6822 
6823 InstructionCost
6824 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6825                                                      ElementCount VF) {
  // Calculate the scalar cost only; the vectorization cost should already
  // have been computed by this point.
6828   if (VF.isScalar()) {
6829     Type *ValTy = getLoadStoreType(I);
6830     const Align Alignment = getLoadStoreAlignment(I);
6831     unsigned AS = getLoadStoreAddressSpace(I);
6832 
6833     return TTI.getAddressComputationCost(ValTy) +
6834            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6835                                TTI::TCK_RecipThroughput, I);
6836   }
6837   return getWideningCost(I, VF);
6838 }
6839 
6840 LoopVectorizationCostModel::VectorizationCostTy
6841 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6842                                                ElementCount VF) {
6843   // If we know that this instruction will remain uniform, check the cost of
6844   // the scalar version.
6845   if (isUniformAfterVectorization(I, VF))
6846     VF = ElementCount::getFixed(1);
6847 
6848   if (VF.isVector() && isProfitableToScalarize(I, VF))
6849     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6850 
6851   // Forced scalars do not have any scalarization overhead.
6852   auto ForcedScalar = ForcedScalars.find(VF);
6853   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6854     auto InstSet = ForcedScalar->second;
6855     if (InstSet.count(I))
6856       return VectorizationCostTy(
6857           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6858            VF.getKnownMinValue()),
6859           false);
6860   }
6861 
6862   Type *VectorTy;
6863   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6864 
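  // The type is considered "not scalarized" if the legalized vector type is
  // split into fewer parts than the number of lanes, i.e. the operation is
  // genuinely executed on vectors rather than as one scalar copy per lane.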
6865   bool TypeNotScalarized = false;
6866   if (VF.isVector() && VectorTy->isVectorTy()) {
6867     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
6868     if (NumParts)
6869       TypeNotScalarized = NumParts < VF.getKnownMinValue();
6870     else
6871       C = InstructionCost::getInvalid();
6872   }
6873   return VectorizationCostTy(C, TypeNotScalarized);
6874 }
6875 
6876 InstructionCost
6877 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6878                                                      ElementCount VF) const {
6879 
6880   // There is no mechanism yet to create a scalable scalarization loop,
6881   // so this is currently Invalid.
6882   if (VF.isScalable())
6883     return InstructionCost::getInvalid();
6884 
6885   if (VF.isScalar())
6886     return 0;
6887 
6888   InstructionCost Cost = 0;
6889   Type *RetTy = ToVectorTy(I->getType(), VF);
6890   if (!RetTy->isVoidTy() &&
6891       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6892     Cost += TTI.getScalarizationOverhead(
6893         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6894         false);
6895 
6896   // Some targets keep addresses scalar.
6897   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6898     return Cost;
6899 
6900   // Some targets support efficient element stores.
6901   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6902     return Cost;
6903 
6904   // Collect operands to consider.
6905   CallInst *CI = dyn_cast<CallInst>(I);
6906   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6907 
6908   // Skip operands that do not require extraction/scalarization and do not incur
6909   // any overhead.
6910   SmallVector<Type *> Tys;
6911   for (auto *V : filterExtractingOperands(Ops, VF))
6912     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6913   return Cost + TTI.getOperandsScalarizationOverhead(
6914                     filterExtractingOperands(Ops, VF), Tys);
6915 }
6916 
6917 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6918   if (VF.isScalar())
6919     return;
6920   NumPredStores = 0;
6921   for (BasicBlock *BB : TheLoop->blocks()) {
6922     // For each instruction in the old loop.
6923     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6925       if (!Ptr)
6926         continue;
6927 
6928       // TODO: We should generate better code and update the cost model for
6929       // predicated uniform stores. Today they are treated as any other
6930       // predicated store (see added test cases in
6931       // invariant-store-vectorization.ll).
6932       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6933         NumPredStores++;
6934 
6935       if (Legal->isUniformMemOp(I)) {
6936         // TODO: Avoid replicating loads and stores instead of
6937         // relying on instcombine to remove them.
6938         // Load: Scalar load + broadcast
6939         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6940         InstructionCost Cost;
6941         if (isa<StoreInst>(&I) && VF.isScalable() &&
6942             isLegalGatherOrScatter(&I, VF)) {
6943           Cost = getGatherScatterCost(&I, VF);
6944           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6945         } else {
6946           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
6947                  "Cannot yet scalarize uniform stores");
6948           Cost = getUniformMemOpCost(&I, VF);
6949           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6950         }
6951         continue;
6952       }
6953 
6954       // We assume that widening is the best solution when possible.
6955       if (memoryInstructionCanBeWidened(&I, VF)) {
6956         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6957         int ConsecutiveStride = Legal->isConsecutivePtr(
6958             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6959         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6960                "Expected consecutive stride.");
6961         InstWidening Decision =
6962             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6963         setWideningDecision(&I, VF, Decision, Cost);
6964         continue;
6965       }
6966 
6967       // Choose between Interleaving, Gather/Scatter or Scalarization.
6968       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6969       unsigned NumAccesses = 1;
6970       if (isAccessInterleaved(&I)) {
6971         auto Group = getInterleavedAccessGroup(&I);
6972         assert(Group && "Fail to get an interleaved access group.");
6973 
6974         // Make one decision for the whole group.
6975         if (getWideningDecision(&I, VF) != CM_Unknown)
6976           continue;
6977 
6978         NumAccesses = Group->getNumMembers();
6979         if (interleavedAccessCanBeWidened(&I, VF))
6980           InterleaveCost = getInterleaveGroupCost(&I, VF);
6981       }
6982 
6983       InstructionCost GatherScatterCost =
6984           isLegalGatherOrScatter(&I, VF)
6985               ? getGatherScatterCost(&I, VF) * NumAccesses
6986               : InstructionCost::getInvalid();
6987 
6988       InstructionCost ScalarizationCost =
6989           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6990 
6991       // Choose better solution for the current VF,
6992       // write down this decision and use it during vectorization.
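      // Note the comparisons below: interleaving must not be costlier than
      // gather/scatter and must be strictly cheaper than scalarization, and a
      // gather/scatter must be strictly cheaper than scalarization.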
6993       InstructionCost Cost;
6994       InstWidening Decision;
6995       if (InterleaveCost <= GatherScatterCost &&
6996           InterleaveCost < ScalarizationCost) {
6997         Decision = CM_Interleave;
6998         Cost = InterleaveCost;
6999       } else if (GatherScatterCost < ScalarizationCost) {
7000         Decision = CM_GatherScatter;
7001         Cost = GatherScatterCost;
7002       } else {
7003         Decision = CM_Scalarize;
7004         Cost = ScalarizationCost;
7005       }
      // If the instruction belongs to an interleave group, the whole group
7007       // receives the same decision. The whole group receives the cost, but
7008       // the cost will actually be assigned to one instruction.
7009       if (auto Group = getInterleavedAccessGroup(&I))
7010         setWideningDecision(Group, VF, Decision, Cost);
7011       else
7012         setWideningDecision(&I, VF, Decision, Cost);
7013     }
7014   }
7015 
7016   // Make sure that any load of address and any other address computation
7017   // remains scalar unless there is gather/scatter support. This avoids
7018   // inevitable extracts into address registers, and also has the benefit of
7019   // activating LSR more, since that pass can't optimize vectorized
7020   // addresses.
7021   if (TTI.prefersVectorizedAddressing())
7022     return;
7023 
7024   // Start with all scalar pointer uses.
7025   SmallPtrSet<Instruction *, 8> AddrDefs;
7026   for (BasicBlock *BB : TheLoop->blocks())
7027     for (Instruction &I : *BB) {
7028       Instruction *PtrDef =
7029         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7030       if (PtrDef && TheLoop->contains(PtrDef) &&
7031           getWideningDecision(&I, VF) != CM_GatherScatter)
7032         AddrDefs.insert(PtrDef);
7033     }
7034 
7035   // Add all instructions used to generate the addresses.
7036   SmallVector<Instruction *, 4> Worklist;
7037   append_range(Worklist, AddrDefs);
7038   while (!Worklist.empty()) {
7039     Instruction *I = Worklist.pop_back_val();
7040     for (auto &Op : I->operands())
7041       if (auto *InstOp = dyn_cast<Instruction>(Op))
7042         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7043             AddrDefs.insert(InstOp).second)
7044           Worklist.push_back(InstOp);
7045   }
7046 
7047   for (auto *I : AddrDefs) {
7048     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded value is involved in an address computation, it is instead
      // changed here when we know this is the case.
7053       InstWidening Decision = getWideningDecision(I, VF);
7054       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7055         // Scalarize a widened load of address.
7056         setWideningDecision(
7057             I, VF, CM_Scalarize,
7058             (VF.getKnownMinValue() *
7059              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7060       else if (auto Group = getInterleavedAccessGroup(I)) {
7061         // Scalarize an interleave group of address loads.
7062         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7063           if (Instruction *Member = Group->getMember(I))
7064             setWideningDecision(
7065                 Member, VF, CM_Scalarize,
7066                 (VF.getKnownMinValue() *
7067                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7068         }
7069       }
7070     } else
7071       // Make sure I gets scalarized and a cost estimate without
7072       // scalarization overhead.
7073       ForcedScalars[VF].insert(I);
7074   }
7075 }
7076 
7077 InstructionCost
7078 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7079                                                Type *&VectorTy) {
7080   Type *RetTy = I->getType();
7081   if (canTruncateToMinimalBitwidth(I, VF))
7082     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7083   auto SE = PSE.getSE();
7084   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7085 
7086   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7087                                                 ElementCount VF) -> bool {
7088     if (VF.isScalar())
7089       return true;
7090 
7091     auto Scalarized = InstsToScalarize.find(VF);
7092     assert(Scalarized != InstsToScalarize.end() &&
7093            "VF not yet analyzed for scalarization profitability");
7094     return !Scalarized->second.count(I) &&
7095            llvm::all_of(I->users(), [&](User *U) {
7096              auto *UI = cast<Instruction>(U);
7097              return !Scalarized->second.count(UI);
7098            });
7099   };
7100   (void) hasSingleCopyAfterVectorization;
7101 
7102   if (isScalarAfterVectorization(I, VF)) {
7103     // With the exception of GEPs and PHIs, after scalarization there should
7104     // only be one copy of the instruction generated in the loop. This is
7105     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
7107     // it means we don't have to multiply the instruction cost by VF.
7108     assert(I->getOpcode() == Instruction::GetElementPtr ||
7109            I->getOpcode() == Instruction::PHI ||
7110            (I->getOpcode() == Instruction::BitCast &&
7111             I->getType()->isPointerTy()) ||
7112            hasSingleCopyAfterVectorization(I, VF));
7113     VectorTy = RetTy;
7114   } else
7115     VectorTy = ToVectorTy(RetTy, VF);
7116 
7117   // TODO: We need to estimate the cost of intrinsic calls.
7118   switch (I->getOpcode()) {
7119   case Instruction::GetElementPtr:
7120     // We mark this instruction as zero-cost because the cost of GEPs in
7121     // vectorized code depends on whether the corresponding memory instruction
7122     // is scalarized or not. Therefore, we handle GEPs with the memory
7123     // instruction cost.
7124     return 0;
7125   case Instruction::Br: {
7126     // In cases of scalarized and predicated instructions, there will be VF
7127     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7129     bool ScalarPredicatedBB = false;
7130     BranchInst *BI = cast<BranchInst>(I);
7131     if (VF.isVector() && BI->isConditional() &&
7132         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7133          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7134       ScalarPredicatedBB = true;
7135 
7136     if (ScalarPredicatedBB) {
7137       // Not possible to scalarize scalable vector with predicated instructions.
7138       if (VF.isScalable())
7139         return InstructionCost::getInvalid();
7140       // Return cost for branches around scalarized and predicated blocks.
7141       auto *Vec_i1Ty =
7142           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7143       return (
7144           TTI.getScalarizationOverhead(
7145               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7146           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7147     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7148       // The back-edge branch will remain, as will all scalar branches.
7149       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7150     else
7151       // This branch will be eliminated by if-conversion.
7152       return 0;
7153     // Note: We currently assume zero cost for an unconditional branch inside
7154     // a predicated block since it will become a fall-through, although we
7155     // may decide in the future to call TTI for all branches.
7156   }
7157   case Instruction::PHI: {
7158     auto *Phi = cast<PHINode>(I);
7159 
7160     // First-order recurrences are replaced by vector shuffles inside the loop.
7161     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7162     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7163       return TTI.getShuffleCost(
7164           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7165           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7166 
7167     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7168     // converted into select instructions. We require N - 1 selects per phi
7169     // node, where N is the number of incoming values.
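    // For example, a phi with three incoming values from an if-else diamond
    // becomes two vector selects after if-conversion.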
7170     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7171       return (Phi->getNumIncomingValues() - 1) *
7172              TTI.getCmpSelInstrCost(
7173                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7174                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7175                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7176 
7177     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7178   }
7179   case Instruction::UDiv:
7180   case Instruction::SDiv:
7181   case Instruction::URem:
7182   case Instruction::SRem:
7183     // If we have a predicated instruction, it may not be executed for each
7184     // vector lane. Get the scalarization cost and scale this amount by the
7185     // probability of executing the predicated block. If the instruction is not
7186     // predicated, we fall through to the next case.
7187     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7188       InstructionCost Cost = 0;
7189 
7190       // These instructions have a non-void type, so account for the phi nodes
7191       // that we will create. This cost is likely to be zero. The phi node
7192       // cost, if any, should be scaled by the block probability because it
7193       // models a copy at the end of each predicated block.
7194       Cost += VF.getKnownMinValue() *
7195               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7196 
7197       // The cost of the non-predicated instruction.
7198       Cost += VF.getKnownMinValue() *
7199               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7200 
7201       // The cost of insertelement and extractelement instructions needed for
7202       // scalarization.
7203       Cost += getScalarizationOverhead(I, VF);
7204 
7205       // Scale the cost by the probability of executing the predicated blocks.
7206       // This assumes the predicated block for each vector lane is equally
7207       // likely.
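           // getReciprocalPredBlockProb() models the probability of executing each
           // predicated block (currently assumed to be 50%).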
7208       return Cost / getReciprocalPredBlockProb();
7209     }
7210     LLVM_FALLTHROUGH;
7211   case Instruction::Add:
7212   case Instruction::FAdd:
7213   case Instruction::Sub:
7214   case Instruction::FSub:
7215   case Instruction::Mul:
7216   case Instruction::FMul:
7217   case Instruction::FDiv:
7218   case Instruction::FRem:
7219   case Instruction::Shl:
7220   case Instruction::LShr:
7221   case Instruction::AShr:
7222   case Instruction::And:
7223   case Instruction::Or:
7224   case Instruction::Xor: {
7225     // Since we will replace the stride by 1, the multiplication should go away.
7226     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7227       return 0;
7228 
7229     // Detect reduction patterns
7230     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7231       return *RedCost;
7232 
7233     // Certain instructions can be cheaper to vectorize if they have a constant
7234     // second vector operand. One example of this is shifts on x86.
7235     Value *Op2 = I->getOperand(1);
7236     TargetTransformInfo::OperandValueProperties Op2VP;
7237     TargetTransformInfo::OperandValueKind Op2VK =
7238         TTI.getOperandInfo(Op2, Op2VP);
7239     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7240       Op2VK = TargetTransformInfo::OK_UniformValue;
7241 
7242     SmallVector<const Value *, 4> Operands(I->operand_values());
7243     return TTI.getArithmeticInstrCost(
7244         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7245         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7246   }
7247   case Instruction::FNeg: {
7248     return TTI.getArithmeticInstrCost(
7249         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7250         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7251         TargetTransformInfo::OP_None, I->getOperand(0), I);
7252   }
7253   case Instruction::Select: {
7254     SelectInst *SI = cast<SelectInst>(I);
7255     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7256     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7257 
7258     const Value *Op0, *Op1;
7259     using namespace llvm::PatternMatch;
7260     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7261                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7262       // select x, y, false --> x & y
7263       // select x, true, y --> x | y
7264       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7265       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7266       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7267       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7268       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7269               Op1->getType()->getScalarSizeInBits() == 1);
7270 
7271       SmallVector<const Value *, 2> Operands{Op0, Op1};
7272       return TTI.getArithmeticInstrCost(
7273           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7274           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7275     }
7276 
7277     Type *CondTy = SI->getCondition()->getType();
7278     if (!ScalarCond)
7279       CondTy = VectorType::get(CondTy, VF);
7280 
7281     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7282     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7283       Pred = Cmp->getPredicate();
7284     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7285                                   CostKind, I);
7286   }
7287   case Instruction::ICmp:
7288   case Instruction::FCmp: {
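         // The cost of a compare is based on the type of its operands, which may
         // have been shrunk to a narrower bitwidth than in the original IR.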
7289     Type *ValTy = I->getOperand(0)->getType();
7290     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7291     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7292       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7293     VectorTy = ToVectorTy(ValTy, VF);
7294     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7295                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7296                                   I);
7297   }
7298   case Instruction::Store:
7299   case Instruction::Load: {
7300     ElementCount Width = VF;
7301     if (Width.isVector()) {
7302       InstWidening Decision = getWideningDecision(I, Width);
7303       assert(Decision != CM_Unknown &&
7304              "CM decision should be taken at this point");
7305       if (Decision == CM_Scalarize)
7306         Width = ElementCount::getFixed(1);
7307     }
7308     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7309     return getMemoryInstructionCost(I, VF);
7310   }
7311   case Instruction::BitCast:
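         // Bitcasts that produce a pointer are free; other bitcasts fall through
         // and are costed like the casts handled below.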
7312     if (I->getType()->isPointerTy())
7313       return 0;
7314     LLVM_FALLTHROUGH;
7315   case Instruction::ZExt:
7316   case Instruction::SExt:
7317   case Instruction::FPToUI:
7318   case Instruction::FPToSI:
7319   case Instruction::FPExt:
7320   case Instruction::PtrToInt:
7321   case Instruction::IntToPtr:
7322   case Instruction::SIToFP:
7323   case Instruction::UIToFP:
7324   case Instruction::Trunc:
7325   case Instruction::FPTrunc: {
7326     // Computes the CastContextHint from a Load/Store instruction.
7327     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7328       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7329              "Expected a load or a store!");
7330 
7331       if (VF.isScalar() || !TheLoop->contains(I))
7332         return TTI::CastContextHint::Normal;
7333 
7334       switch (getWideningDecision(I, VF)) {
7335       case LoopVectorizationCostModel::CM_GatherScatter:
7336         return TTI::CastContextHint::GatherScatter;
7337       case LoopVectorizationCostModel::CM_Interleave:
7338         return TTI::CastContextHint::Interleave;
7339       case LoopVectorizationCostModel::CM_Scalarize:
7340       case LoopVectorizationCostModel::CM_Widen:
7341         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7342                                         : TTI::CastContextHint::Normal;
7343       case LoopVectorizationCostModel::CM_Widen_Reverse:
7344         return TTI::CastContextHint::Reversed;
7345       case LoopVectorizationCostModel::CM_Unknown:
7346         llvm_unreachable("Instr did not go through cost modelling?");
7347       }
7348 
7349       llvm_unreachable("Unhandled case!");
7350     };
7351 
7352     unsigned Opcode = I->getOpcode();
7353     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7354     // For Trunc/FPTrunc, the context is the only user, which must be a StoreInst.
7355     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7356       if (I->hasOneUse())
7357         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7358           CCH = ComputeCCH(Store);
7359     }
7360     // For ZExt/SExt/FPExt, the context is the operand, which must be a LoadInst.
7361     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7362              Opcode == Instruction::FPExt) {
7363       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7364         CCH = ComputeCCH(Load);
7365     }
7366 
7367     // We optimize the truncation of induction variables having constant
7368     // integer steps. The cost of these truncations is the same as the scalar
7369     // operation.
7370     if (isOptimizableIVTruncate(I, VF)) {
7371       auto *Trunc = cast<TruncInst>(I);
7372       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7373                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7374     }
7375 
7376     // Detect reduction patterns
7377     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7378       return *RedCost;
7379 
7380     Type *SrcScalarTy = I->getOperand(0)->getType();
7381     Type *SrcVecTy =
7382         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7383     if (canTruncateToMinimalBitwidth(I, VF)) {
7384       // This cast is going to be shrunk. This may remove the cast or it might
7385     // turn it into a slightly different cast. For example, if MinBW == 16,
7386       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7387       //
7388       // Calculate the modified src and dest types.
7389       Type *MinVecTy = VectorTy;
7390       if (Opcode == Instruction::Trunc) {
7391         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7392         VectorTy =
7393             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7394       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7395         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7396         VectorTy =
7397             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7398       }
7399     }
7400 
7401     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7402   }
7403   case Instruction::Call: {
7404     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7405       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7406         return *RedCost;
7407     bool NeedToScalarize;
7408     CallInst *CI = cast<CallInst>(I);
7409     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7410     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7411       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7412       return std::min(CallCost, IntrinsicCost);
7413     }
7414     return CallCost;
7415   }
7416   case Instruction::ExtractValue:
7417     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7418   case Instruction::Alloca:
7419     // We cannot easily widen an alloca to a scalable alloca, as
7420     // the result would need to be a vector of pointers.
7421     if (VF.isScalable())
7422       return InstructionCost::getInvalid();
7423     LLVM_FALLTHROUGH;
7424   default:
7425     // This opcode is unknown. Assume that it is the same as 'mul'.
7426     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7427   } // end of switch.
7428 }
7429 
7430 char LoopVectorize::ID = 0;
7431 
7432 static const char lv_name[] = "Loop Vectorization";
7433 
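     // Register the legacy pass and the analyses it depends on with the legacy
     // pass manager.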
7434 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7435 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7436 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7437 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7438 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7439 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7440 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7441 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7442 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7443 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7444 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7445 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7446 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7447 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7448 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7449 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7450 
7451 namespace llvm {
7452 
7453 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7454 
7455 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7456                               bool VectorizeOnlyWhenForced) {
7457   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7458 }
7459 
7460 } // end namespace llvm
7461 
7462 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7463   // Check if the pointer operand of a load or store instruction is
7464   // consecutive.
7465   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7466     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7467   return false;
7468 }
7469 
7470 void LoopVectorizationCostModel::collectValuesToIgnore() {
7471   // Ignore ephemeral values.
7472   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7473 
7474   // Ignore type-promoting instructions we identified during reduction
7475   // detection.
7476   for (auto &Reduction : Legal->getReductionVars()) {
7477     const RecurrenceDescriptor &RedDes = Reduction.second;
7478     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7479     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7480   }
7481   // Ignore type-casting instructions we identified during induction
7482   // detection.
7483   for (auto &Induction : Legal->getInductionVars()) {
7484     const InductionDescriptor &IndDes = Induction.second;
7485     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7486     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7487   }
7488 }
7489 
7490 void LoopVectorizationCostModel::collectInLoopReductions() {
7491   for (auto &Reduction : Legal->getReductionVars()) {
7492     PHINode *Phi = Reduction.first;
7493     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7494 
7495     // We don't collect reductions that are type promoted (yet).
7496     if (RdxDesc.getRecurrenceType() != Phi->getType())
7497       continue;
7498 
7499     // If the target would prefer this reduction to happen "in-loop", then we
7500     // want to record it as such.
7501     unsigned Opcode = RdxDesc.getOpcode();
7502     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7503         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7504                                    TargetTransformInfo::ReductionFlags()))
7505       continue;
7506 
7507     // Check that we can correctly put the reductions into the loop, by
7508     // finding the chain of operations that leads from the phi to the loop
7509     // exit value.
7510     SmallVector<Instruction *, 4> ReductionOperations =
7511         RdxDesc.getReductionOpChain(Phi, TheLoop);
7512     bool InLoop = !ReductionOperations.empty();
7513     if (InLoop) {
7514       InLoopReductionChains[Phi] = ReductionOperations;
7515       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7516       Instruction *LastChain = Phi;
7517       for (auto *I : ReductionOperations) {
7518         InLoopReductionImmediateChains[I] = LastChain;
7519         LastChain = I;
7520       }
7521     }
7522     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7523                       << " reduction for phi: " << *Phi << "\n");
7524   }
7525 }
7526 
7527 // TODO: we could return a pair of values that specify the max VF and
7528 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7529 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7530 // doesn't have a cost model that can choose which plan to execute if
7531 // more than one is generated.
7532 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7533                                  LoopVectorizationCostModel &CM) {
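       // For example, 256-bit wide vector registers and a widest scalar type of
       // 32 bits give a VF of 256 / 32 = 8.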
7534   unsigned WidestType;
7535   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7536   return WidestVectorRegBits / WidestType;
7537 }
7538 
7539 VectorizationFactor
7540 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7541   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7542   ElementCount VF = UserVF;
7543   // Outer loop handling: They may require CFG and instruction level
7544   // transformations before even evaluating whether vectorization is profitable.
7545   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7546   // the vectorization pipeline.
7547   if (!OrigLoop->isInnermost()) {
7548     // If the user doesn't provide a vectorization factor, determine a
7549     // reasonable one.
7550     if (UserVF.isZero()) {
7551       VF = ElementCount::getFixed(determineVPlanVF(
7552           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7553               .getFixedSize(),
7554           CM));
7555       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7556 
7557       // Make sure we have a VF > 1 for stress testing.
7558       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7559         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7560                           << "overriding computed VF.\n");
7561         VF = ElementCount::getFixed(4);
7562       }
7563     }
7564     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7565     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7566            "VF needs to be a power of two");
7567     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7568                       << "VF " << VF << " to build VPlans.\n");
7569     buildVPlans(VF, VF);
7570 
7571     // For VPlan build stress testing, we bail out after VPlan construction.
7572     if (VPlanBuildStressTest)
7573       return VectorizationFactor::Disabled();
7574 
7575     return {VF, 0 /*Cost*/};
7576   }
7577 
7578   LLVM_DEBUG(
7579       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7580                 "VPlan-native path.\n");
7581   return VectorizationFactor::Disabled();
7582 }
7583 
7584 Optional<VectorizationFactor>
7585 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7586   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7587   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7588   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7589     return None;
7590 
7591   // Invalidate interleave groups if all blocks of loop will be predicated.
7592   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7593       !useMaskedInterleavedAccesses(*TTI)) {
7594     LLVM_DEBUG(
7595         dbgs()
7596         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7597            "which requires masked-interleaved support.\n");
7598     if (CM.InterleaveInfo.invalidateGroups())
7599       // Invalidating interleave groups also requires invalidating all decisions
7600       // based on them, which includes widening decisions and uniform and scalar
7601       // values.
7602       CM.invalidateCostModelingDecisions();
7603   }
7604 
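       // Check a user-provided VF against the maximum feasible VF of the same
       // kind (fixed-width or scalable).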
7605   ElementCount MaxUserVF =
7606       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7607   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7608   if (!UserVF.isZero() && UserVFIsLegal) {
7609     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7610            "VF needs to be a power of two");
7611     // Collect the instructions (and their associated costs) that will be more
7612     // profitable to scalarize.
7613     if (CM.selectUserVectorizationFactor(UserVF)) {
7614       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7615       CM.collectInLoopReductions();
7616       buildVPlansWithVPRecipes(UserVF, UserVF);
7617       LLVM_DEBUG(printPlans(dbgs()));
7618       return {{UserVF, 0}};
7619     } else
7620       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7621                               "InvalidCost", ORE, OrigLoop);
7622   }
7623 
7624   // Populate the set of Vectorization Factor Candidates.
7625   ElementCountSet VFCandidates;
7626   for (auto VF = ElementCount::getFixed(1);
7627        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7628     VFCandidates.insert(VF);
7629   for (auto VF = ElementCount::getScalable(1);
7630        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7631     VFCandidates.insert(VF);
7632 
7633   for (const auto &VF : VFCandidates) {
7634     // Collect Uniform and Scalar instructions after vectorization with VF.
7635     CM.collectUniformsAndScalars(VF);
7636 
7637     // Collect the instructions (and their associated costs) that will be more
7638     // profitable to scalarize.
7639     if (VF.isVector())
7640       CM.collectInstsToScalarize(VF);
7641   }
7642 
7643   CM.collectInLoopReductions();
7644   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7645   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7646 
7647   LLVM_DEBUG(printPlans(dbgs()));
7648   if (!MaxFactors.hasVector())
7649     return VectorizationFactor::Disabled();
7650 
7651   // Select the optimal vectorization factor.
7652   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7653 
7654   // Check if it is profitable to vectorize with runtime checks.
7655   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7656   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7657     bool PragmaThresholdReached =
7658         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7659     bool ThresholdReached =
7660         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7661     if ((ThresholdReached && !Hints.allowReordering()) ||
7662         PragmaThresholdReached) {
7663       ORE->emit([&]() {
7664         return OptimizationRemarkAnalysisAliasing(
7665                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7666                    OrigLoop->getHeader())
7667                << "loop not vectorized: cannot prove it is safe to reorder "
7668                   "memory operations";
7669       });
7670       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7671       Hints.emitRemarkWithHints();
7672       return VectorizationFactor::Disabled();
7673     }
7674   }
7675   return SelectedVF;
7676 }
7677 
7678 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7679   assert(count_if(VPlans,
7680                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7681              1 &&
7682          "Best VF does not have a single VPlan.");
7683 
7684   for (const VPlanPtr &Plan : VPlans) {
7685     if (Plan->hasVF(VF))
7686       return *Plan.get();
7687   }
7688   llvm_unreachable("No plan found!");
7689 }
7690 
7691 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7692   SmallVector<Metadata *, 4> MDs;
7693   // Reserve first location for self reference to the LoopID metadata node.
7694   MDs.push_back(nullptr);
7695   bool IsUnrollMetadata = false;
7696   MDNode *LoopID = L->getLoopID();
7697   if (LoopID) {
7698     // First find existing loop unrolling disable metadata.
7699     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7700       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7701       if (MD) {
7702         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7703         IsUnrollMetadata =
7704             S && S->getString().startswith("llvm.loop.unroll.disable");
7705       }
7706       MDs.push_back(LoopID->getOperand(i));
7707     }
7708   }
7709 
7710   if (!IsUnrollMetadata) {
7711     // Add runtime unroll disable metadata.
7712     LLVMContext &Context = L->getHeader()->getContext();
7713     SmallVector<Metadata *, 1> DisableOperands;
7714     DisableOperands.push_back(
7715         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7716     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7717     MDs.push_back(DisableNode);
7718     MDNode *NewLoopID = MDNode::get(Context, MDs);
7719     // Set operand 0 to refer to the loop id itself.
7720     NewLoopID->replaceOperandWith(0, NewLoopID);
7721     L->setLoopID(NewLoopID);
7722   }
7723 }
7724 
7725 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7726                                            VPlan &BestVPlan,
7727                                            InnerLoopVectorizer &ILV,
7728                                            DominatorTree *DT) {
7729   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7730                     << ", UF=" << BestUF << '\n');
7731 
7732   // Perform the actual loop transformation.
7733 
7734   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7735   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7736   Value *CanonicalIVStartValue;
7737   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7738       ILV.createVectorizedLoopSkeleton();
7739   ILV.collectPoisonGeneratingRecipes(State);
7740 
7741   ILV.printDebugTracesAtStart();
7742 
7743   //===------------------------------------------------===//
7744   //
7745   // Notice: any optimization or new instruction that goes
7746   // into the code below should also be implemented in
7747   // the cost-model.
7748   //
7749   //===------------------------------------------------===//
7750 
7751   // 2. Copy and widen instructions from the old loop into the new loop.
7752   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7753                              ILV.getOrCreateVectorTripCount(nullptr),
7754                              CanonicalIVStartValue, State);
7755   BestVPlan.execute(&State);
7756 
7757   // Keep all loop hints from the original loop on the vector loop (we'll
7758   // replace the vectorizer-specific hints below).
7759   MDNode *OrigLoopID = OrigLoop->getLoopID();
7760 
7761   Optional<MDNode *> VectorizedLoopID =
7762       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7763                                       LLVMLoopVectorizeFollowupVectorized});
7764 
7765   Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7766   if (VectorizedLoopID.hasValue())
7767     L->setLoopID(VectorizedLoopID.getValue());
7768   else {
7769     // Keep all loop hints from the original loop on the vector loop (we'll
7770     // replace the vectorizer-specific hints below).
7771     if (MDNode *LID = OrigLoop->getLoopID())
7772       L->setLoopID(LID);
7773 
7774     LoopVectorizeHints Hints(L, true, *ORE);
7775     Hints.setAlreadyVectorized();
7776   }
7777   // Disable runtime unrolling when vectorizing the epilogue loop.
7778   if (CanonicalIVStartValue)
7779     AddRuntimeUnrollDisableMetaData(L);
7780 
7781   // 3. Fix the vectorized code: take care of header phis, live-outs,
7782   //    predication, updating analyses.
7783   ILV.fixVectorizedLoop(State);
7784 
7785   ILV.printDebugTracesAtEnd();
7786 }
7787 
7788 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7789 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7790   for (const auto &Plan : VPlans)
7791     if (PrintVPlansInDotFormat)
7792       Plan->printDOT(O);
7793     else
7794       Plan->print(O);
7795 }
7796 #endif
7797 
7798 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7799     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7800 
7801   // We create new control-flow for the vectorized loop, so the original exit
7802   // conditions will be dead after vectorization if they are only used by the
7803   // terminator.
7804   SmallVector<BasicBlock*> ExitingBlocks;
7805   OrigLoop->getExitingBlocks(ExitingBlocks);
7806   for (auto *BB : ExitingBlocks) {
7807     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7808     if (!Cmp || !Cmp->hasOneUse())
7809       continue;
7810 
7811     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7812     if (!DeadInstructions.insert(Cmp).second)
7813       continue;
7814 
7815     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7816     // TODO: can recurse through operands in general
7817     for (Value *Op : Cmp->operands()) {
7818       if (isa<TruncInst>(Op) && Op->hasOneUse())
7819           DeadInstructions.insert(cast<Instruction>(Op));
7820     }
7821   }
7822 
7823   // We create new "steps" for induction variable updates to which the original
7824   // induction variables map. An original update instruction will be dead if
7825   // all its users except the induction variable are dead.
7826   auto *Latch = OrigLoop->getLoopLatch();
7827   for (auto &Induction : Legal->getInductionVars()) {
7828     PHINode *Ind = Induction.first;
7829     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7830 
7831     // If the tail is to be folded by masking, the primary induction variable,
7832     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7833     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7834       continue;
7835 
7836     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7837           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7838         }))
7839       DeadInstructions.insert(IndUpdate);
7840   }
7841 }
7842 
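     // When only unrolling (VF = 1), scalar values need no broadcast into a
     // vector; return the value unchanged.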
7843 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7844 
7845 //===--------------------------------------------------------------------===//
7846 // EpilogueVectorizerMainLoop
7847 //===--------------------------------------------------------------------===//
7848 
7849 /// This function is partially responsible for generating the control flow
7850 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7851 std::pair<BasicBlock *, Value *>
7852 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7853   MDNode *OrigLoopID = OrigLoop->getLoopID();
7854   Loop *Lp = createVectorLoopSkeleton("");
7855 
7856   // Generate the code to check the minimum iteration count of the vector
7857   // epilogue (see below).
7858   EPI.EpilogueIterationCountCheck =
7859       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7860   EPI.EpilogueIterationCountCheck->setName("iter.check");
7861 
7862   // Generate the code to check any assumptions that we've made for SCEV
7863   // expressions.
7864   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7865 
7866   // Generate the code that checks at runtime if arrays overlap. We put the
7867   // checks into a separate block to make the more common case of few elements
7868   // faster.
7869   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7870 
7871   // Generate the iteration count check for the main loop, *after* the check
7872   // for the epilogue loop, so that the path-length is shorter for the case
7873   // that goes directly through the vector epilogue. The longer path length for
7874   // the main loop is compensated for by the gain from vectorizing the larger
7875   // trip count. Note: the branch will get updated later on when we vectorize
7876   // the epilogue.
7877   EPI.MainLoopIterationCountCheck =
7878       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7879 
7880   // Generate the induction variable.
7881   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7882   EPI.VectorTripCount = CountRoundDown;
7883   createHeaderBranch(Lp);
7884 
7885   // Skip induction resume value creation here because they will be created in
7886   // the second pass. If we created them here, they wouldn't be used anyway,
7887   // because the VPlan in the second pass still contains the inductions from the
7888   // original loop.
7889 
7890   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
7891 }
7892 
7893 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7894   LLVM_DEBUG({
7895     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7896            << "Main Loop VF:" << EPI.MainLoopVF
7897            << ", Main Loop UF:" << EPI.MainLoopUF
7898            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7899            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7900   });
7901 }
7902 
7903 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7904   DEBUG_WITH_TYPE(VerboseDebug, {
7905     dbgs() << "intermediate fn:\n"
7906            << *OrigLoop->getHeader()->getParent() << "\n";
7907   });
7908 }
7909 
7910 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7911     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7912   assert(L && "Expected valid Loop.");
7913   assert(Bypass && "Expected valid bypass basic block.");
7914   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7915   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7916   Value *Count = getOrCreateTripCount(L);
7917   // Reuse existing vector loop preheader for TC checks.
7918   // Note that new preheader block is generated for vector loop.
7919   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7920   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7921 
7922   // Generate code to check if the loop's trip count is less than VF * UF of the
7923   // main vector loop.
7924   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7925       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7926 
7927   Value *CheckMinIters = Builder.CreateICmp(
7928       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7929       "min.iters.check");
7930 
7931   if (!ForEpilogue)
7932     TCCheckBlock->setName("vector.main.loop.iter.check");
7933 
7934   // Create new preheader for vector loop.
7935   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7936                                    DT, LI, nullptr, "vector.ph");
7937 
7938   if (ForEpilogue) {
7939     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7940                                  DT->getNode(Bypass)->getIDom()) &&
7941            "TC check is expected to dominate Bypass");
7942 
7943     // Update dominator for Bypass & LoopExit.
7944     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7945     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7946       // For loops with multiple exits, there's no edge from the middle block
7947       // to exit blocks (as the epilogue must run) and thus no need to update
7948       // the immediate dominator of the exit blocks.
7949       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7950 
7951     LoopBypassBlocks.push_back(TCCheckBlock);
7952 
7953     // Save the trip count so we don't have to regenerate it in the
7954     // vec.epilog.iter.check. This is safe to do because the trip count
7955     // generated here dominates the vector epilog iter check.
7956     EPI.TripCount = Count;
7957   }
7958 
7959   ReplaceInstWithInst(
7960       TCCheckBlock->getTerminator(),
7961       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7962 
7963   return TCCheckBlock;
7964 }
7965 
7966 //===--------------------------------------------------------------------===//
7967 // EpilogueVectorizerEpilogueLoop
7968 //===--------------------------------------------------------------------===//
7969 
7970 /// This function is partially responsible for generating the control flow
7971 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7972 std::pair<BasicBlock *, Value *>
7973 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7974   MDNode *OrigLoopID = OrigLoop->getLoopID();
7975   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7976 
7977   // Now, compare the remaining count and if there aren't enough iterations to
7978   // execute the vectorized epilogue skip to the scalar part.
7979   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7980   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7981   LoopVectorPreHeader =
7982       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7983                  LI, nullptr, "vec.epilog.ph");
7984   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7985                                           VecEpilogueIterationCountCheck);
7986 
7987   // Adjust the control flow taking the state info from the main loop
7988   // vectorization into account.
7989   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7990          "expected this to be saved from the previous pass.");
7991   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7992       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7993 
7994   DT->changeImmediateDominator(LoopVectorPreHeader,
7995                                EPI.MainLoopIterationCountCheck);
7996 
7997   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7998       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7999 
8000   if (EPI.SCEVSafetyCheck)
8001     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8002         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8003   if (EPI.MemSafetyCheck)
8004     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8005         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8006 
8007   DT->changeImmediateDominator(
8008       VecEpilogueIterationCountCheck,
8009       VecEpilogueIterationCountCheck->getSinglePredecessor());
8010 
8011   DT->changeImmediateDominator(LoopScalarPreHeader,
8012                                EPI.EpilogueIterationCountCheck);
8013   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8014     // If there is an epilogue which must run, there's no edge from the
8015     // middle block to exit blocks and thus no need to update the immediate
8016     // dominator of the exit blocks.
8017     DT->changeImmediateDominator(LoopExitBlock,
8018                                  EPI.EpilogueIterationCountCheck);
8019 
8020   // Keep track of bypass blocks, as they feed start values to the induction
8021   // phis in the scalar loop preheader.
8022   if (EPI.SCEVSafetyCheck)
8023     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8024   if (EPI.MemSafetyCheck)
8025     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8026   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8027 
8028   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
8029   // merge control-flow from the latch block and the middle block. Update the
8030   // incoming values here and move the Phi into the preheader.
8031   SmallVector<PHINode *, 4> PhisInBlock;
8032   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8033     PhisInBlock.push_back(&Phi);
8034 
8035   for (PHINode *Phi : PhisInBlock) {
8036     Phi->replaceIncomingBlockWith(
8037         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8038         VecEpilogueIterationCountCheck);
8039     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8040     if (EPI.SCEVSafetyCheck)
8041       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8042     if (EPI.MemSafetyCheck)
8043       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8044     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8045   }
8046 
8047   // Generate a resume induction for the vector epilogue and put it in the
8048   // vector epilogue preheader.
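       // The resume value is the main loop's vector trip count if the main vector
       // loop executed, or zero if its iteration-count check branched around it.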
8049   Type *IdxTy = Legal->getWidestInductionType();
8050   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8051                                          LoopVectorPreHeader->getFirstNonPHI());
8052   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8053   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8054                            EPI.MainLoopIterationCountCheck);
8055 
8056   // Generate the induction variable.
8057   createHeaderBranch(Lp);
8058 
8059   // Generate induction resume values. These variables save the new starting
8060   // indexes for the scalar loop. They are used to test if there are any tail
8061   // iterations left once the vector loop has completed.
8062   // Note that when the vectorized epilogue is skipped due to iteration count
8063   // check, then the resume value for the induction variable comes from
8064   // the trip count of the main vector loop, hence passing the AdditionalBypass
8065   // argument.
8066   createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8067                                    EPI.VectorTripCount} /* AdditionalBypass */);
8068 
8069   return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8070 }
8071 
8072 BasicBlock *
8073 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8074     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8075 
8076   assert(EPI.TripCount &&
8077          "Expected trip count to have been saved in the first pass.");
8078   assert(
8079       (!isa<Instruction>(EPI.TripCount) ||
8080        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8081       "saved trip count does not dominate insertion point.");
8082   Value *TC = EPI.TripCount;
8083   IRBuilder<> Builder(Insert->getTerminator());
8084   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8085 
8086   // Generate code to check if the loop's trip count is less than VF * UF of the
8087   // vector epilogue loop.
8088   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8089       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8090 
8091   Value *CheckMinIters =
8092       Builder.CreateICmp(P, Count,
8093                          createStepForVF(Builder, Count->getType(),
8094                                          EPI.EpilogueVF, EPI.EpilogueUF),
8095                          "min.epilog.iters.check");
8096 
8097   ReplaceInstWithInst(
8098       Insert->getTerminator(),
8099       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8100 
8101   LoopBypassBlocks.push_back(Insert);
8102   return Insert;
8103 }
8104 
8105 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8106   LLVM_DEBUG({
8107     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8108            << "Epilogue Loop VF:" << EPI.EpilogueVF
8109            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8110   });
8111 }
8112 
8113 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8114   DEBUG_WITH_TYPE(VerboseDebug, {
8115     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8116   });
8117 }
8118 
8119 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8120     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8121   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8122   bool PredicateAtRangeStart = Predicate(Range.Start);
8123 
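       // Clamp Range.End at the first VF whose decision differs from the decision
       // at Range.Start, so all VFs remaining in the range share the same decision.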
8124   for (ElementCount TmpVF = Range.Start * 2;
8125        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8126     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8127       Range.End = TmpVF;
8128       break;
8129     }
8130 
8131   return PredicateAtRangeStart;
8132 }
8133 
8134 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8135 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8136 /// of VF's starting at a given VF and extending it as much as possible. Each
8137 /// vectorization decision can potentially shorten this sub-range during
8138 /// buildVPlan().
8139 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8140                                            ElementCount MaxVF) {
8141   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8142   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8143     VFRange SubRange = {VF, MaxVFPlusOne};
8144     VPlans.push_back(buildVPlan(SubRange));
8145     VF = SubRange.End;
8146   }
8147 }
8148 
8149 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8150                                          VPlanPtr &Plan) {
8151   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8152 
8153   // Look for cached value.
8154   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8155   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8156   if (ECEntryIt != EdgeMaskCache.end())
8157     return ECEntryIt->second;
8158 
8159   VPValue *SrcMask = createBlockInMask(Src, Plan);
8160 
8161   // The terminator has to be a branch inst!
8162   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8163   assert(BI && "Unexpected terminator found");
8164 
8165   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8166     return EdgeMaskCache[Edge] = SrcMask;
8167 
8168   // If source is an exiting block, we know the exit edge is dynamically dead
8169   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8170   // adding uses of an otherwise potentially dead instruction.
8171   if (OrigLoop->isLoopExiting(Src))
8172     return EdgeMaskCache[Edge] = SrcMask;
8173 
8174   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8175   assert(EdgeMask && "No Edge Mask found for condition");
8176 
8177   if (BI->getSuccessor(0) != Dst)
8178     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8179 
8180   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8181     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8182     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8183     // The select version does not introduce new UB if SrcMask is false and
8184     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8185     VPValue *False = Plan->getOrAddVPValue(
8186         ConstantInt::getFalse(BI->getCondition()->getType()));
8187     EdgeMask =
8188         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8189   }
8190 
8191   return EdgeMaskCache[Edge] = EdgeMask;
8192 }
8193 
8194 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8195   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8196 
8197   // Look for cached value.
8198   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8199   if (BCEntryIt != BlockMaskCache.end())
8200     return BCEntryIt->second;
8201 
8202   // All-one mask is modelled as no-mask following the convention for masked
8203   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8204   VPValue *BlockMask = nullptr;
8205 
8206   if (OrigLoop->getHeader() == BB) {
8207     if (!CM.blockNeedsPredicationForAnyReason(BB))
8208       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8209 
8210     // Introduce the early-exit compare IV <= BTC to form header block mask.
8211     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8212     // constructing the desired canonical IV in the header block as its first
8213     // non-phi instructions.
8214     assert(CM.foldTailByMasking() && "must fold the tail");
8215     VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8216     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8217     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8218     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8219 
8220     VPBuilder::InsertPointGuard Guard(Builder);
8221     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8222     if (CM.TTI.emitGetActiveLaneMask()) {
8223       VPValue *TC = Plan->getOrCreateTripCount();
8224       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8225     } else {
8226       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8227       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8228     }
8229     return BlockMaskCache[BB] = BlockMask;
8230   }
8231 
8232   // This is the block mask. We OR all incoming edges.
8233   for (auto *Predecessor : predecessors(BB)) {
8234     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8235     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8236       return BlockMaskCache[BB] = EdgeMask;
8237 
8238     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8239       BlockMask = EdgeMask;
8240       continue;
8241     }
8242 
8243     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8244   }
8245 
8246   return BlockMaskCache[BB] = BlockMask;
8247 }
8248 
8249 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8250                                                 ArrayRef<VPValue *> Operands,
8251                                                 VFRange &Range,
8252                                                 VPlanPtr &Plan) {
8253   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8254          "Must be called with either a load or store");
8255 
8256   auto willWiden = [&](ElementCount VF) -> bool {
8257     if (VF.isScalar())
8258       return false;
8259     LoopVectorizationCostModel::InstWidening Decision =
8260         CM.getWideningDecision(I, VF);
8261     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8262            "CM decision should be taken at this point.");
8263     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8264       return true;
8265     if (CM.isScalarAfterVectorization(I, VF) ||
8266         CM.isProfitableToScalarize(I, VF))
8267       return false;
8268     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8269   };
8270 
8271   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8272     return nullptr;
8273 
8274   VPValue *Mask = nullptr;
8275   if (Legal->isMaskRequired(I))
8276     Mask = createBlockInMask(I->getParent(), Plan);
8277 
8278   // Determine if the pointer operand of the access is either consecutive or
8279   // reverse consecutive.
8280   LoopVectorizationCostModel::InstWidening Decision =
8281       CM.getWideningDecision(I, Range.Start);
8282   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8283   bool Consecutive =
8284       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8285 
8286   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8287     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8288                                               Consecutive, Reverse);
8289 
8290   StoreInst *Store = cast<StoreInst>(I);
8291   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8292                                             Mask, Consecutive, Reverse);
8293 }
8294 
8295 static VPWidenIntOrFpInductionRecipe *
8296 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
8297                            VPValue *Start, const InductionDescriptor &IndDesc,
8298                            LoopVectorizationCostModel &CM, ScalarEvolution &SE,
8299                            Loop &OrigLoop, VFRange &Range) {
8300   // Returns true if an instruction \p I should be scalarized instead of
8301   // vectorized for the chosen vectorization factor.
8302   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8303     return CM.isScalarAfterVectorization(I, VF) ||
8304            CM.isProfitableToScalarize(I, VF);
8305   };
8306 
8307   bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
8308       [&](ElementCount VF) {
8309         // Returns true if we should generate a scalar version of \p IV.
8310         if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
8311           return true;
8312         auto isScalarInst = [&](User *U) -> bool {
8313           auto *I = cast<Instruction>(U);
8314           return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
8315         };
8316         return any_of(PhiOrTrunc->users(), isScalarInst);
8317       },
8318       Range);
8319   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8320       [&](ElementCount VF) {
8321         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8322       },
8323       Range);
8324   assert(IndDesc.getStartValue() ==
8325          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8326   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8327          "step must be loop invariant");
8328   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8329     return new VPWidenIntOrFpInductionRecipe(
8330         Phi, Start, IndDesc, TruncI, NeedsScalarIV, !NeedsScalarIVOnly, SE);
8331   }
8332   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8333   return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
8334                                            !NeedsScalarIVOnly, SE);
8335 }
8336 
8337 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8338     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
8339 
8340   // Check if this is an integer or fp induction. If so, build the recipe that
8341   // produces its scalar and vector values.
8342   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8343     return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM,
8344                                       *PSE.getSE(), *OrigLoop, Range);
8345 
8346   return nullptr;
8347 }
8348 
8349 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8350     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8351     VPlan &Plan) const {
8352   // Optimize the special case where the source is a constant integer
8353   // induction variable. Notice that we can only optimize the 'trunc' case
8354   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8355   // (c) other casts depend on pointer size.
8356 
8357   // Determine whether \p K is a truncation based on an induction variable that
8358   // can be optimized.
8359   auto isOptimizableIVTruncate =
8360       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8361     return [=](ElementCount VF) -> bool {
8362       return CM.isOptimizableIVTruncate(K, VF);
8363     };
8364   };
8365 
8366   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8367           isOptimizableIVTruncate(I), Range)) {
8368 
8369     auto *Phi = cast<PHINode>(I->getOperand(0));
8370     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8371     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8372     return createWidenInductionRecipe(Phi, I, Start, II, CM, *PSE.getSE(),
8373                                       *OrigLoop, Range);
8374   }
8375   return nullptr;
8376 }
8377 
8378 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8379                                                 ArrayRef<VPValue *> Operands,
8380                                                 VPlanPtr &Plan) {
8381   // If all incoming values are equal, the incoming VPValue can be used directly
8382   // instead of creating a new VPBlendRecipe.
8383   VPValue *FirstIncoming = Operands[0];
8384   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8385         return FirstIncoming == Inc;
8386       })) {
8387     return Operands[0];
8388   }
8389 
8390   unsigned NumIncoming = Phi->getNumIncomingValues();
8391   // For in-loop reductions, we do not need to create an additional select.
8392   VPValue *InLoopVal = nullptr;
8393   for (unsigned In = 0; In < NumIncoming; In++) {
8394     PHINode *PhiOp =
8395         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8396     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8397       assert(!InLoopVal && "Found more than one in-loop reduction!");
8398       InLoopVal = Operands[In];
8399     }
8400   }
8401 
8402   assert((!InLoopVal || NumIncoming == 2) &&
8403          "Found an in-loop reduction for PHI with unexpected number of "
8404          "incoming values");
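       // As noted above, no extra select is needed; use the incoming value that is
       // not the in-loop reduction.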
8405   if (InLoopVal)
8406     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8407 
8408   // We know that all PHIs in non-header blocks are converted into selects, so
8409   // we don't have to worry about the insertion order and we can just use the
8410   // builder. At this point we generate the predication tree. There may be
8411   // duplications since this is a simple recursive scan, but future
8412   // optimizations will clean it up.
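  // For example (illustrative), a phi merging two predicated paths
  //   %r = phi i32 [ %a, %if.then ], [ %b, %if.else ]
  // becomes a VPBlendRecipe with operands {%a, mask(if.then->bb), %b,
  // mask(if.else->bb)}, which is later lowered to a chain of selects on the
  // edge masks.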
8413   SmallVector<VPValue *, 2> OperandsWithMask;
8414 
8415   for (unsigned In = 0; In < NumIncoming; In++) {
8416     VPValue *EdgeMask =
8417       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8418     assert((EdgeMask || NumIncoming == 1) &&
8419            "Multiple predecessors with one having a full mask");
8420     OperandsWithMask.push_back(Operands[In]);
8421     if (EdgeMask)
8422       OperandsWithMask.push_back(EdgeMask);
8423   }
8424   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8425 }
8426 
8427 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8428                                                    ArrayRef<VPValue *> Operands,
8429                                                    VFRange &Range) const {
8430 
8431   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8432       [this, CI](ElementCount VF) {
8433         return CM.isScalarWithPredication(CI, VF);
8434       },
8435       Range);
8436 
8437   if (IsPredicated)
8438     return nullptr;
8439 
8440   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8441   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8442              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8443              ID == Intrinsic::pseudoprobe ||
8444              ID == Intrinsic::experimental_noalias_scope_decl))
8445     return nullptr;
8446 
8447   auto willWiden = [&](ElementCount VF) -> bool {
8448     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8449     // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction: is performing the intrinsic
    // call more beneficial than using a vectorized library call?
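    // For example (illustrative): when the call maps to a vector intrinsic at
    // this VF, we keep whichever of the intrinsic and the widened library
    // call is cheaper, and only refuse to widen when both variants would have
    // to be scalarized.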
8453     bool NeedToScalarize = false;
8454     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8455     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8456     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8457     return UseVectorIntrinsic || !NeedToScalarize;
8458   };
8459 
8460   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8461     return nullptr;
8462 
8463   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8464   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8465 }
8466 
8467 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8468   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8469          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8472   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8473     return CM.isScalarAfterVectorization(I, VF) ||
8474            CM.isProfitableToScalarize(I, VF) ||
8475            CM.isScalarWithPredication(I, VF);
8476   };
8477   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8478                                                              Range);
8479 }
8480 
8481 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8482                                            ArrayRef<VPValue *> Operands) const {
8483   auto IsVectorizableOpcode = [](unsigned Opcode) {
8484     switch (Opcode) {
8485     case Instruction::Add:
8486     case Instruction::And:
8487     case Instruction::AShr:
8488     case Instruction::BitCast:
8489     case Instruction::FAdd:
8490     case Instruction::FCmp:
8491     case Instruction::FDiv:
8492     case Instruction::FMul:
8493     case Instruction::FNeg:
8494     case Instruction::FPExt:
8495     case Instruction::FPToSI:
8496     case Instruction::FPToUI:
8497     case Instruction::FPTrunc:
8498     case Instruction::FRem:
8499     case Instruction::FSub:
8500     case Instruction::ICmp:
8501     case Instruction::IntToPtr:
8502     case Instruction::LShr:
8503     case Instruction::Mul:
8504     case Instruction::Or:
8505     case Instruction::PtrToInt:
8506     case Instruction::SDiv:
8507     case Instruction::Select:
8508     case Instruction::SExt:
8509     case Instruction::Shl:
8510     case Instruction::SIToFP:
8511     case Instruction::SRem:
8512     case Instruction::Sub:
8513     case Instruction::Trunc:
8514     case Instruction::UDiv:
8515     case Instruction::UIToFP:
8516     case Instruction::URem:
8517     case Instruction::Xor:
8518     case Instruction::ZExt:
8519       return true;
8520     }
8521     return false;
8522   };
8523 
8524   if (!IsVectorizableOpcode(I->getOpcode()))
8525     return nullptr;
8526 
8527   // Success: widen this instruction.
8528   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8529 }
8530 
8531 void VPRecipeBuilder::fixHeaderPhis() {
8532   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8533   for (VPHeaderPHIRecipe *R : PhisToFix) {
8534     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8535     VPRecipeBase *IncR =
8536         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8537     R->addOperand(IncR->getVPSingleValue());
8538   }
8539 }
8540 
8541 VPBasicBlock *VPRecipeBuilder::handleReplication(
8542     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8543     VPlanPtr &Plan) {
8544   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8545       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8546       Range);
8547 
8548   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8549       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8550       Range);
8551 
8552   // Even if the instruction is not marked as uniform, there are certain
8553   // intrinsic calls that can be effectively treated as such, so we check for
8554   // them here. Conservatively, we only do this for scalable vectors, since
8555   // for fixed-width VFs we can always fall back on full scalarization.
8556   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8557     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8558     case Intrinsic::assume:
8559     case Intrinsic::lifetime_start:
8560     case Intrinsic::lifetime_end:
8561       // For scalable vectors if one of the operands is variant then we still
8562       // want to mark as uniform, which will generate one instruction for just
8563       // the first lane of the vector. We can't scalarize the call in the same
8564       // way as for fixed-width vectors because we don't know how many lanes
8565       // there are.
8566       //
8567       // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
8570       //      example, the input may be a splat across all lanes.
8571       //   2. For the lifetime start/end intrinsics the pointer operand only
8572       //      does anything useful when the input comes from a stack object,
8573       //      which suggests it should always be uniform. For non-stack objects
8574       //      the effect is to poison the object, which still allows us to
8575       //      remove the call.
8576       IsUniform = true;
8577       break;
8578     default:
8579       break;
8580     }
8581   }
8582 
8583   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8584                                        IsUniform, IsPredicated);
8585   setRecipe(I, Recipe);
8586   Plan->addVPValue(I, Recipe);
8587 
8588   // Find if I uses a predicated instruction. If so, it will use its scalar
8589   // value. Avoid hoisting the insert-element which packs the scalar value into
8590   // a vector value, as that happens iff all users use the vector value.
8591   for (VPValue *Op : Recipe->operands()) {
8592     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8593     if (!PredR)
8594       continue;
8595     auto *RepR =
8596         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8597     assert(RepR->isPredicated() &&
8598            "expected Replicate recipe to be predicated");
8599     RepR->setAlsoPack(false);
8600   }
8601 
  // Finalize the recipe for Instr, handling the non-predicated case first.
8603   if (!IsPredicated) {
8604     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8605     VPBB->appendRecipe(Recipe);
8606     return VPBB;
8607   }
8608   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8609 
8610   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8611   assert(SingleSucc && "VPBB must have a single successor when handling "
8612                        "predicated replication.");
8613   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8614   // Record predicated instructions for above packing optimizations.
8615   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8616   VPBlockUtils::insertBlockAfter(Region, VPBB);
8617   auto *RegSucc = new VPBasicBlock();
8618   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8619   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8620   return RegSucc;
8621 }
8622 
8623 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8624                                                       VPRecipeBase *PredRecipe,
8625                                                       VPlanPtr &Plan) {
8626   // Instructions marked for predication are replicated and placed under an
8627   // if-then construct to prevent side-effects.
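  // For example (illustrative), a predicated udiv ends up in a region of the
  // form:
  //   pred.udiv.entry:
  //     BRANCH-ON-MASK (block-in mask)
  //   pred.udiv.if:
  //     REPLICATE udiv (predicated)
  //   pred.udiv.continue:
  //     PHI (packs the scalar result, if the udiv produces a value)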
8628 
8629   // Generate recipes to compute the block mask for this region.
8630   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8631 
8632   // Build the triangular if-then region.
8633   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8634   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8635   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8636   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8637   auto *PHIRecipe = Instr->getType()->isVoidTy()
8638                         ? nullptr
8639                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8640   if (PHIRecipe) {
8641     Plan->removeVPValueFor(Instr);
8642     Plan->addVPValue(Instr, PHIRecipe);
8643   }
8644   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8645   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8646   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8647 
8648   // Note: first set Entry as region entry and then connect successors starting
8649   // from it in order, to propagate the "parent" of each VPBasicBlock.
8650   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8651   VPBlockUtils::connectBlocks(Pred, Exit);
8652 
8653   return Region;
8654 }
8655 
8656 VPRecipeOrVPValueTy
8657 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8658                                         ArrayRef<VPValue *> Operands,
8659                                         VFRange &Range, VPlanPtr &Plan) {
8660   // First, check for specific widening recipes that deal with calls, memory
8661   // operations, inductions and Phi nodes.
8662   if (auto *CI = dyn_cast<CallInst>(Instr))
8663     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8664 
8665   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8666     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8667 
8668   VPRecipeBase *Recipe;
8669   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8670     if (Phi->getParent() != OrigLoop->getHeader())
8671       return tryToBlend(Phi, Operands, Plan);
8672     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8673       return toVPRecipeResult(Recipe);
8674 
8675     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8676     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8677       VPValue *StartV = Operands[0];
8678       if (Legal->isReductionVariable(Phi)) {
8679         const RecurrenceDescriptor &RdxDesc =
8680             Legal->getReductionVars().find(Phi)->second;
8681         assert(RdxDesc.getRecurrenceStartValue() ==
8682                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8683         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8684                                              CM.isInLoopReduction(Phi),
8685                                              CM.useOrderedReductions(RdxDesc));
8686       } else {
8687         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8688       }
8689 
8690       // Record the incoming value from the backedge, so we can add the incoming
8691       // value from the backedge after all recipes have been created.
8692       recordRecipeOf(cast<Instruction>(
8693           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8694       PhisToFix.push_back(PhiRecipe);
8695     } else {
8696       // TODO: record backedge value for remaining pointer induction phis.
8697       assert(Phi->getType()->isPointerTy() &&
8698              "only pointer phis should be handled here");
8699       assert(Legal->getInductionVars().count(Phi) &&
8700              "Not an induction variable");
8701       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8702       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8703       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8704     }
8705 
8706     return toVPRecipeResult(PhiRecipe);
8707   }
8708 
8709   if (isa<TruncInst>(Instr) &&
8710       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8711                                                Range, *Plan)))
8712     return toVPRecipeResult(Recipe);
8713 
8714   if (!shouldWiden(Instr, Range))
8715     return nullptr;
8716 
8717   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8718     return toVPRecipeResult(new VPWidenGEPRecipe(
8719         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8720 
8721   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8722     bool InvariantCond =
8723         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8724     return toVPRecipeResult(new VPWidenSelectRecipe(
8725         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8726   }
8727 
8728   return toVPRecipeResult(tryToWiden(Instr, Operands));
8729 }
8730 
8731 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8732                                                         ElementCount MaxVF) {
8733   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8734 
8735   // Collect instructions from the original loop that will become trivially dead
8736   // in the vectorized loop. We don't need to vectorize these instructions. For
8737   // example, original induction update instructions can become dead because we
8738   // separately emit induction "steps" when generating code for the new loop.
8739   // Similarly, we create a new latch condition when setting up the structure
8740   // of the new loop, so the old one can become dead.
8741   SmallPtrSet<Instruction *, 4> DeadInstructions;
8742   collectTriviallyDeadInstructions(DeadInstructions);
8743 
8744   // Add assume instructions we need to drop to DeadInstructions, to prevent
8745   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8747   // control flow is preserved, we should keep them.
8748   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8749   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8750 
8751   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8752   // Dead instructions do not need sinking. Remove them from SinkAfter.
8753   for (Instruction *I : DeadInstructions)
8754     SinkAfter.erase(I);
8755 
8756   // Cannot sink instructions after dead instructions (there won't be any
8757   // recipes for them). Instead, find the first non-dead previous instruction.
8758   for (auto &P : Legal->getSinkAfter()) {
8759     Instruction *SinkTarget = P.second;
8760     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8761     (void)FirstInst;
8762     while (DeadInstructions.contains(SinkTarget)) {
8763       assert(
8764           SinkTarget != FirstInst &&
8765           "Must find a live instruction (at least the one feeding the "
8766           "first-order recurrence PHI) before reaching beginning of the block");
8767       SinkTarget = SinkTarget->getPrevNode();
8768       assert(SinkTarget != P.first &&
8769              "sink source equals target, no sinking required");
8770     }
8771     P.second = SinkTarget;
8772   }
8773 
8774   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8775   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8776     VFRange SubRange = {VF, MaxVFPlusOne};
8777     VPlans.push_back(
8778         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8779     VF = SubRange.End;
8780   }
8781 }
8782 
8783 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8784 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8785 // BranchOnCount VPInstruction to the latch.
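// Conceptually (illustrative), the resulting skeleton is:
//   vector loop header: %iv      = canonical-iv-phi [ 0, ph ], [ %iv.next ]
//   vector loop latch:  %iv.next = %iv + VF * UF
//                       branch-on-count %iv.next, vector-trip-count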
8786 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8787                                   bool HasNUW, bool IsVPlanNative) {
8788   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8789   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8790 
8791   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8792   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8793   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8794   if (IsVPlanNative)
8795     Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
8796   Header->insert(CanonicalIVPHI, Header->begin());
8797 
8798   auto *CanonicalIVIncrement =
8799       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8800                                : VPInstruction::CanonicalIVIncrement,
8801                         {CanonicalIVPHI}, DL);
8802   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8803 
8804   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8805   if (IsVPlanNative) {
8806     EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
8807     EB->setCondBit(nullptr);
8808   }
8809   EB->appendRecipe(CanonicalIVIncrement);
8810 
8811   auto *BranchOnCount =
8812       new VPInstruction(VPInstruction::BranchOnCount,
8813                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8814   EB->appendRecipe(BranchOnCount);
8815 }
8816 
8817 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8818     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8819     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8820 
8821   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8822 
8823   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8824 
8825   // ---------------------------------------------------------------------------
8826   // Pre-construction: record ingredients whose recipes we'll need to further
8827   // process after constructing the initial VPlan.
8828   // ---------------------------------------------------------------------------
8829 
8830   // Mark instructions we'll need to sink later and their targets as
8831   // ingredients whose recipe we'll need to record.
8832   for (auto &Entry : SinkAfter) {
8833     RecipeBuilder.recordRecipeOf(Entry.first);
8834     RecipeBuilder.recordRecipeOf(Entry.second);
8835   }
8836   for (auto &Reduction : CM.getInLoopReductionChains()) {
8837     PHINode *Phi = Reduction.first;
8838     RecurKind Kind =
8839         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8840     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8841 
8842     RecipeBuilder.recordRecipeOf(Phi);
8843     for (auto &R : ReductionOperations) {
8844       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
8846       // need to record the ICmp recipe, so it can be removed later.
8847       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8848              "Only min/max recurrences allowed for inloop reductions");
8849       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8850         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8851     }
8852   }
8853 
8854   // For each interleave group which is relevant for this (possibly trimmed)
8855   // Range, add it to the set of groups to be later applied to the VPlan and add
8856   // placeholders for its members' Recipes which we'll be replacing with a
8857   // single VPInterleaveRecipe.
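  // For example (illustrative), the two strided loads of a factor-2 group
  // accessing A[2*i] and A[2*i+1] get a single VPInterleaveRecipe that
  // performs one wide load and de-interleaves the lanes for its members.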
8858   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8859     auto applyIG = [IG, this](ElementCount VF) -> bool {
8860       return (VF.isVector() && // Query is illegal for VF == 1
8861               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8862                   LoopVectorizationCostModel::CM_Interleave);
8863     };
8864     if (!getDecisionAndClampRange(applyIG, Range))
8865       continue;
8866     InterleaveGroups.insert(IG);
8867     for (unsigned i = 0; i < IG->getFactor(); i++)
8868       if (Instruction *Member = IG->getMember(i))
8869         RecipeBuilder.recordRecipeOf(Member);
  }
8871 
8872   // ---------------------------------------------------------------------------
8873   // Build initial VPlan: Scan the body of the loop in a topological order to
8874   // visit each basic block after having visited its predecessor basic blocks.
8875   // ---------------------------------------------------------------------------
8876 
8877   // Create initial VPlan skeleton, with separate header and latch blocks.
8878   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
8879   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8880   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8881   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8882   auto Plan = std::make_unique<VPlan>(TopRegion);
8883 
8884   Instruction *DLInst =
8885       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8886   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8887                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8888                         !CM.foldTailByMasking(), false);
8889 
8890   // Scan the body of the loop in a topological order to visit each basic block
8891   // after having visited its predecessor basic blocks.
8892   LoopBlocksDFS DFS(OrigLoop);
8893   DFS.perform(LI);
8894 
8895   VPBasicBlock *VPBB = HeaderVPBB;
8896   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8897   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8898     // Relevant instructions from basic block BB will be grouped into VPRecipe
8899     // ingredients and fill a new VPBasicBlock.
8900     unsigned VPBBsForBB = 0;
8901     VPBB->setName(BB->getName());
8902     Builder.setInsertPoint(VPBB);
8903 
8904     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8906     for (Instruction &I : BB->instructionsWithoutDebug()) {
8907       Instruction *Instr = &I;
8908 
8909       // First filter out irrelevant instructions, to ensure no recipes are
8910       // built for them.
8911       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8912         continue;
8913 
8914       SmallVector<VPValue *, 4> Operands;
8915       auto *Phi = dyn_cast<PHINode>(Instr);
8916       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8917         Operands.push_back(Plan->getOrAddVPValue(
8918             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8919       } else {
8920         auto OpRange = Plan->mapToVPValues(Instr->operands());
8921         Operands = {OpRange.begin(), OpRange.end()};
8922       }
8923       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8924               Instr, Operands, Range, Plan)) {
8925         // If Instr can be simplified to an existing VPValue, use it.
8926         if (RecipeOrValue.is<VPValue *>()) {
8927           auto *VPV = RecipeOrValue.get<VPValue *>();
8928           Plan->addVPValue(Instr, VPV);
8929           // If the re-used value is a recipe, register the recipe for the
8930           // instruction, in case the recipe for Instr needs to be recorded.
8931           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8932             RecipeBuilder.setRecipe(Instr, R);
8933           continue;
8934         }
8935         // Otherwise, add the new recipe.
8936         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8937         for (auto *Def : Recipe->definedValues()) {
8938           auto *UV = Def->getUnderlyingValue();
8939           Plan->addVPValue(UV, Def);
8940         }
8941 
8942         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8943             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8944           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8945           // of the header block. That can happen for truncates of induction
8946           // variables. Those recipes are moved to the phi section of the header
8947           // block after applying SinkAfter, which relies on the original
8948           // position of the trunc.
8949           assert(isa<TruncInst>(Instr));
8950           InductionsToMove.push_back(
8951               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8952         }
8953         RecipeBuilder.setRecipe(Instr, Recipe);
8954         VPBB->appendRecipe(Recipe);
8955         continue;
8956       }
8957 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8960       VPBasicBlock *NextVPBB =
8961           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8962       if (NextVPBB != VPBB) {
8963         VPBB = NextVPBB;
8964         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8965                                     : "");
8966       }
8967     }
8968 
8969     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8970     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8971   }
8972 
8973   // Fold the last, empty block into its predecessor.
8974   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8975   assert(VPBB && "expected to fold last (empty) block");
8976   // After here, VPBB should not be used.
8977   VPBB = nullptr;
8978 
8979   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
8980          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
8981          "entry block must be set to a VPRegionBlock having a non-empty entry "
8982          "VPBasicBlock");
8983   RecipeBuilder.fixHeaderPhis();
8984 
8985   // ---------------------------------------------------------------------------
8986   // Transform initial VPlan: Apply previously taken decisions, in order, to
8987   // bring the VPlan to its final state.
8988   // ---------------------------------------------------------------------------
8989 
8990   // Apply Sink-After legal constraints.
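  // For example (illustrative), for a first-order recurrence
  //   %rec  = phi i32 [ %init, %preheader ], [ %next, %latch ]
  //   %use  = add i32 %rec, 1
  //   %next = ...
  // legality may have recorded SinkAfter[%use] = %next, so that %use's recipe
  // is placed after the recipe producing %next, where the spliced "previous"
  // value is available.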
8991   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8992     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8993     if (Region && Region->isReplicator()) {
8994       assert(Region->getNumSuccessors() == 1 &&
8995              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8996       assert(R->getParent()->size() == 1 &&
8997              "A recipe in an original replicator region must be the only "
8998              "recipe in its block");
8999       return Region;
9000     }
9001     return nullptr;
9002   };
9003   for (auto &Entry : SinkAfter) {
9004     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9005     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9006 
9007     auto *TargetRegion = GetReplicateRegion(Target);
9008     auto *SinkRegion = GetReplicateRegion(Sink);
9009     if (!SinkRegion) {
9010       // If the sink source is not a replicate region, sink the recipe directly.
9011       if (TargetRegion) {
9012         // The target is in a replication region, make sure to move Sink to
9013         // the block after it, not into the replication region itself.
9014         VPBasicBlock *NextBlock =
9015             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9016         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9017       } else
9018         Sink->moveAfter(Target);
9019       continue;
9020     }
9021 
9022     // The sink source is in a replicate region. Unhook the region from the CFG.
9023     auto *SinkPred = SinkRegion->getSinglePredecessor();
9024     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9025     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9026     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9027     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9028 
9029     if (TargetRegion) {
9030       // The target recipe is also in a replicate region, move the sink region
9031       // after the target region.
9032       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9033       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9034       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9035       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9036     } else {
      // The sink source is in a replicate region, so we need to move the whole
9038       // replicate region, which should only contain a single recipe in the
9039       // main block.
9040       auto *SplitBlock =
9041           Target->getParent()->splitAt(std::next(Target->getIterator()));
9042 
9043       auto *SplitPred = SplitBlock->getSinglePredecessor();
9044 
9045       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9046       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9047       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9048     }
9049   }
9050 
9051   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9052   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9053 
9054   // Now that sink-after is done, move induction recipes for optimized truncates
9055   // to the phi section of the header block.
9056   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9057     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9058 
9059   // Adjust the recipes for any inloop reductions.
9060   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9061                              RecipeBuilder, Range.Start);
9062 
9063   // Introduce a recipe to combine the incoming and previous values of a
9064   // first-order recurrence.
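  // Conceptually (illustrative), for a recurrence phi vp<%p> with backedge
  // value vp<%b>, the splice produces <p[VF-1], b[0], ..., b[VF-2]> per part,
  // i.e. each lane observes the value produced one iteration earlier.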
9065   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9066     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9067     if (!RecurPhi)
9068       continue;
9069 
9070     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9071     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9072     auto *Region = GetReplicateRegion(PrevRecipe);
9073     if (Region)
9074       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9075     if (Region || PrevRecipe->isPhi())
9076       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9077     else
9078       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9079 
9080     auto *RecurSplice = cast<VPInstruction>(
9081         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9082                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9083 
9084     RecurPhi->replaceAllUsesWith(RecurSplice);
9085     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9086     // all users.
9087     RecurSplice->setOperand(0, RecurPhi);
9088   }
9089 
9090   // Interleave memory: for each Interleave Group we marked earlier as relevant
9091   // for this VPlan, replace the Recipes widening its memory instructions with a
9092   // single VPInterleaveRecipe at its insertion point.
9093   for (auto IG : InterleaveGroups) {
9094     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9095         RecipeBuilder.getRecipe(IG->getInsertPos()));
9096     SmallVector<VPValue *, 4> StoredValues;
9097     for (unsigned i = 0; i < IG->getFactor(); ++i)
9098       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9099         auto *StoreR =
9100             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9101         StoredValues.push_back(StoreR->getStoredValue());
9102       }
9103 
9104     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9105                                         Recipe->getMask());
9106     VPIG->insertBefore(Recipe);
9107     unsigned J = 0;
9108     for (unsigned i = 0; i < IG->getFactor(); ++i)
9109       if (Instruction *Member = IG->getMember(i)) {
9110         if (!Member->getType()->isVoidTy()) {
9111           VPValue *OriginalV = Plan->getVPValue(Member);
9112           Plan->removeVPValueFor(Member);
9113           Plan->addVPValue(Member, VPIG->getVPValue(J));
9114           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9115           J++;
9116         }
9117         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9118       }
9119   }
9120 
  // From this point onwards, VPlan-to-VPlan transformations may change the
  // plan in ways that make accessing values via original IR values incorrect.
9123   Plan->disableValue2VPValue();
9124 
9125   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9126   VPlanTransforms::sinkScalarOperands(*Plan);
9127   VPlanTransforms::mergeReplicateRegions(*Plan);
9128   VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop);
9129 
9130   std::string PlanName;
9131   raw_string_ostream RSO(PlanName);
9132   ElementCount VF = Range.Start;
9133   Plan->addVF(VF);
9134   RSO << "Initial VPlan for VF={" << VF;
9135   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9136     Plan->addVF(VF);
9137     RSO << "," << VF;
9138   }
9139   RSO << "},UF>=1";
9140   RSO.flush();
9141   Plan->setName(PlanName);
9142 
9143   // Fold Exit block into its predecessor if possible.
9144   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9145   // VPBasicBlock as exit.
9146   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9147 
9148   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9149   return Plan;
9150 }
9151 
9152 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable.
9155   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9156   // the vectorization pipeline.
9157   assert(!OrigLoop->isInnermost());
9158   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9159 
9160   // Create new empty VPlan
9161   auto Plan = std::make_unique<VPlan>();
9162 
9163   // Build hierarchical CFG
9164   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9165   HCFGBuilder.buildHierarchicalCFG();
9166 
9167   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9168        VF *= 2)
9169     Plan->addVF(VF);
9170 
9171   if (EnableVPlanPredication) {
9172     VPlanPredicator VPP(*Plan);
9173     VPP.predicate();
9174 
9175     // Avoid running transformation to recipes until masked code generation in
9176     // VPlan-native path is in place.
9177     return Plan;
9178   }
9179 
9180   SmallPtrSet<Instruction *, 1> DeadInstructions;
9181   VPlanTransforms::VPInstructionsToVPRecipes(
9182       OrigLoop, Plan,
9183       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9184       DeadInstructions, *PSE.getSE());
9185 
9186   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9187                         true, true);
9188   return Plan;
9189 }
9190 
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
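// For example (illustrative), for an in-loop integer add reduction
//   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %sum.next = add i32 %sum, %x
// the widened add is replaced by a VPReductionRecipe that keeps the chain
// (%sum) scalar and folds the vector operand (%x) into it on every iteration.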
9196 void LoopVectorizationPlanner::adjustRecipesForReductions(
9197     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9198     ElementCount MinVF) {
9199   for (auto &Reduction : CM.getInLoopReductionChains()) {
9200     PHINode *Phi = Reduction.first;
9201     const RecurrenceDescriptor &RdxDesc =
9202         Legal->getReductionVars().find(Phi)->second;
9203     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9204 
9205     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9206       continue;
9207 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max the chain will be the select instructions.
9212     Instruction *Chain = Phi;
9213     for (Instruction *R : ReductionOperations) {
9214       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9215       RecurKind Kind = RdxDesc.getRecurrenceKind();
9216 
9217       VPValue *ChainOp = Plan->getVPValue(Chain);
9218       unsigned FirstOpId;
9219       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9220              "Only min/max recurrences allowed for inloop reductions");
9221       // Recognize a call to the llvm.fmuladd intrinsic.
9222       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9223       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9224              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9225       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9226         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9227                "Expected to replace a VPWidenSelectSC");
9228         FirstOpId = 1;
9229       } else {
9230         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9231                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9232                "Expected to replace a VPWidenSC");
9233         FirstOpId = 0;
9234       }
9235       unsigned VecOpId =
9236           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9237       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9238 
9239       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9240                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9241                          : nullptr;
9242 
9243       if (IsFMulAdd) {
9244         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9245         // need to create an fmul recipe to use as the vector operand for the
9246         // fadd reduction.
9247         VPInstruction *FMulRecipe = new VPInstruction(
9248             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9249         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9250         WidenRecipe->getParent()->insert(FMulRecipe,
9251                                          WidenRecipe->getIterator());
9252         VecOp = FMulRecipe;
9253       }
9254       VPReductionRecipe *RedRecipe =
9255           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9256       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9257       Plan->removeVPValueFor(R);
9258       Plan->addVPValue(R, RedRecipe);
9259       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9261       WidenRecipe->eraseFromParent();
9262 
9263       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9264         VPRecipeBase *CompareRecipe =
9265             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9266         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9267                "Expected to replace a VPWidenSC");
9268         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9269                "Expected no remaining users");
9270         CompareRecipe->eraseFromParent();
9271       }
9272       Chain = R;
9273     }
9274   }
9275 
9276   // If tail is folded by masking, introduce selects between the phi
9277   // and the live-out instruction of each reduction, at the beginning of the
9278   // dedicated latch block.
9279   if (CM.foldTailByMasking()) {
9280     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9281     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9282       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9283       if (!PhiR || PhiR->isInLoop())
9284         continue;
9285       VPValue *Cond =
9286           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9287       VPValue *Red = PhiR->getBackedgeValue();
9288       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9289              "reduction recipe must be defined before latch");
9290       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9291     }
9292   }
9293 }
9294 
9295 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9296 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9297                                VPSlotTracker &SlotTracker) const {
9298   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9299   IG->getInsertPos()->printAsOperand(O, false);
9300   O << ", ";
9301   getAddr()->printAsOperand(O, SlotTracker);
9302   VPValue *Mask = getMask();
9303   if (Mask) {
9304     O << ", ";
9305     Mask->printAsOperand(O, SlotTracker);
9306   }
9307 
9308   unsigned OpIdx = 0;
9309   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9310     if (!IG->getMember(i))
9311       continue;
9312     if (getNumStoreOperands() > 0) {
9313       O << "\n" << Indent << "  store ";
9314       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9315       O << " to index " << i;
9316     } else {
9317       O << "\n" << Indent << "  ";
9318       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9319       O << " = load from index " << i;
9320     }
9321     ++OpIdx;
9322   }
9323 }
9324 #endif
9325 
9326 void VPWidenCallRecipe::execute(VPTransformState &State) {
9327   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9328                                   *this, State);
9329 }
9330 
9331 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9332   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9333   State.ILV->setDebugLocFromInst(&I);
9334 
  // The condition can be loop-invariant but still defined inside the
9336   // loop. This means that we can't just use the original 'cond' value.
9337   // We have to take the 'vectorized' value and pick the first lane.
9338   // Instcombine will make this a no-op.
9339   auto *InvarCond =
9340       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9341 
9342   for (unsigned Part = 0; Part < State.UF; ++Part) {
9343     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9344     Value *Op0 = State.get(getOperand(1), Part);
9345     Value *Op1 = State.get(getOperand(2), Part);
9346     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9347     State.set(this, Sel, Part);
9348     State.ILV->addMetadata(Sel, &I);
9349   }
9350 }
9351 
9352 void VPWidenRecipe::execute(VPTransformState &State) {
9353   auto &I = *cast<Instruction>(getUnderlyingValue());
9354   auto &Builder = State.Builder;
9355   switch (I.getOpcode()) {
9356   case Instruction::Call:
9357   case Instruction::Br:
9358   case Instruction::PHI:
9359   case Instruction::GetElementPtr:
9360   case Instruction::Select:
9361     llvm_unreachable("This instruction is handled by a different recipe.");
9362   case Instruction::UDiv:
9363   case Instruction::SDiv:
9364   case Instruction::SRem:
9365   case Instruction::URem:
9366   case Instruction::Add:
9367   case Instruction::FAdd:
9368   case Instruction::Sub:
9369   case Instruction::FSub:
9370   case Instruction::FNeg:
9371   case Instruction::Mul:
9372   case Instruction::FMul:
9373   case Instruction::FDiv:
9374   case Instruction::FRem:
9375   case Instruction::Shl:
9376   case Instruction::LShr:
9377   case Instruction::AShr:
9378   case Instruction::And:
9379   case Instruction::Or:
9380   case Instruction::Xor: {
9381     // Just widen unops and binops.
9382     State.ILV->setDebugLocFromInst(&I);
9383 
9384     for (unsigned Part = 0; Part < State.UF; ++Part) {
9385       SmallVector<Value *, 2> Ops;
9386       for (VPValue *VPOp : operands())
9387         Ops.push_back(State.get(VPOp, Part));
9388 
9389       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9390 
9391       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9392         VecOp->copyIRFlags(&I);
9393 
9394         // If the instruction is vectorized and was in a basic block that needed
9395         // predication, we can't propagate poison-generating flags (nuw/nsw,
9396         // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could cause
        // the flag properties to no longer hold.
9399         if (State.MayGeneratePoisonRecipes.contains(this))
9400           VecOp->dropPoisonGeneratingFlags();
9401       }
9402 
9403       // Use this vector value for all users of the original instruction.
9404       State.set(this, V, Part);
9405       State.ILV->addMetadata(V, &I);
9406     }
9407 
9408     break;
9409   }
9410   case Instruction::ICmp:
9411   case Instruction::FCmp: {
9412     // Widen compares. Generate vector compares.
9413     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9414     auto *Cmp = cast<CmpInst>(&I);
9415     State.ILV->setDebugLocFromInst(Cmp);
9416     for (unsigned Part = 0; Part < State.UF; ++Part) {
9417       Value *A = State.get(getOperand(0), Part);
9418       Value *B = State.get(getOperand(1), Part);
9419       Value *C = nullptr;
9420       if (FCmp) {
9421         // Propagate fast math flags.
9422         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9423         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9424         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9425       } else {
9426         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9427       }
9428       State.set(this, C, Part);
9429       State.ILV->addMetadata(C, &I);
9430     }
9431 
9432     break;
9433   }
9434 
9435   case Instruction::ZExt:
9436   case Instruction::SExt:
9437   case Instruction::FPToUI:
9438   case Instruction::FPToSI:
9439   case Instruction::FPExt:
9440   case Instruction::PtrToInt:
9441   case Instruction::IntToPtr:
9442   case Instruction::SIToFP:
9443   case Instruction::UIToFP:
9444   case Instruction::Trunc:
9445   case Instruction::FPTrunc:
9446   case Instruction::BitCast: {
9447     auto *CI = cast<CastInst>(&I);
9448     State.ILV->setDebugLocFromInst(CI);
9449 
    // Vectorize casts.
9451     Type *DestTy = (State.VF.isScalar())
9452                        ? CI->getType()
9453                        : VectorType::get(CI->getType(), State.VF);
9454 
9455     for (unsigned Part = 0; Part < State.UF; ++Part) {
9456       Value *A = State.get(getOperand(0), Part);
9457       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9458       State.set(this, Cast, Part);
9459       State.ILV->addMetadata(Cast, &I);
9460     }
9461     break;
9462   }
9463   default:
9464     // This instruction is not vectorized by simple widening.
9465     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9466     llvm_unreachable("Unhandled instruction!");
9467   } // end of switch.
9468 }
9469 
9470 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9471   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9472   // Construct a vector GEP by widening the operands of the scalar GEP as
9473   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9474   // results in a vector of pointers when at least one operand of the GEP
9475   // is vector-typed. Thus, to keep the representation compact, we only use
9476   // vector-typed operands for loop-varying values.
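  // For example (illustrative), for
  //   %p = getelementptr inbounds i32, i32* %base, i64 %iv
  // with a loop-invariant %base and a widened %iv, we emit a single GEP whose
  // index operand is the vector of %iv lanes, yielding a vector of pointers.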
9477 
9478   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9479     // If we are vectorizing, but the GEP has only loop-invariant operands,
9480     // the GEP we build (by only using vector-typed operands for
9481     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9482     // produce a vector of pointers, we need to either arbitrarily pick an
9483     // operand to broadcast, or broadcast a clone of the original GEP.
9484     // Here, we broadcast a clone of the original.
9485     //
9486     // TODO: If at some point we decide to scalarize instructions having
9487     //       loop-invariant operands, this special case will no longer be
9488     //       required. We would add the scalarization decision to
9489     //       collectLoopScalars() and teach getVectorValue() to broadcast
9490     //       the lane-zero scalar value.
9491     auto *Clone = State.Builder.Insert(GEP->clone());
9492     for (unsigned Part = 0; Part < State.UF; ++Part) {
9493       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9494       State.set(this, EntryPart, Part);
9495       State.ILV->addMetadata(EntryPart, GEP);
9496     }
9497   } else {
9498     // If the GEP has at least one loop-varying operand, we are sure to
9499     // produce a vector of pointers. But if we are only unrolling, we want
9500     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9501     // produce with the code below will be scalar (if VF == 1) or vector
9502     // (otherwise). Note that for the unroll-only case, we still maintain
9503     // values in the vector mapping with initVector, as we do for other
9504     // instructions.
9505     for (unsigned Part = 0; Part < State.UF; ++Part) {
9506       // The pointer operand of the new GEP. If it's loop-invariant, we
9507       // won't broadcast it.
9508       auto *Ptr = IsPtrLoopInvariant
9509                       ? State.get(getOperand(0), VPIteration(0, 0))
9510                       : State.get(getOperand(0), Part);
9511 
9512       // Collect all the indices for the new GEP. If any index is
9513       // loop-invariant, we won't broadcast it.
9514       SmallVector<Value *, 4> Indices;
9515       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9516         VPValue *Operand = getOperand(I);
9517         if (IsIndexLoopInvariant[I - 1])
9518           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9519         else
9520           Indices.push_back(State.get(Operand, Part));
9521       }
9522 
9523       // If the GEP instruction is vectorized and was in a basic block that
9524       // needed predication, we can't propagate the poison-generating 'inbounds'
9525       // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could cause the 'inbounds' property to
      // no longer hold.
9528       bool IsInBounds =
9529           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9530 
9531       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9532       // but it should be a vector, otherwise.
9533       auto *NewGEP = IsInBounds
9534                          ? State.Builder.CreateInBoundsGEP(
9535                                GEP->getSourceElementType(), Ptr, Indices)
9536                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9537                                                    Ptr, Indices);
9538       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9539              "NewGEP is not a pointer vector");
9540       State.set(this, NewGEP, Part);
9541       State.ILV->addMetadata(NewGEP, GEP);
9542     }
9543   }
9544 }
9545 
9546 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9547   assert(!State.Instance && "Int or FP induction being replicated.");
9548 
9549   Value *Start = getStartValue()->getLiveInIRValue();
9550   const InductionDescriptor &ID = getInductionDescriptor();
9551   TruncInst *Trunc = getTruncInst();
9552   IRBuilderBase &Builder = State.Builder;
9553   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9554   assert(State.VF.isVector() && "must have vector VF");
9555 
9556   // The value from the original loop to which we are mapping the new induction
9557   // variable.
9558   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9559 
9560   auto &DL = EntryVal->getModule()->getDataLayout();
9561 
9562   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
9564   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
9565     if (SE.isSCEVable(IV->getType())) {
9566       SCEVExpander Exp(SE, DL, "induction");
9567       return Exp.expandCodeFor(Step, Step->getType(),
9568                                State.CFG.VectorPreHeader->getTerminator());
9569     }
9570     return cast<SCEVUnknown>(Step)->getValue();
9571   };
9572 
9573   // Fast-math-flags propagate from the original induction instruction.
9574   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9575   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9576     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9577 
9578   // Now do the actual transformations, and start with creating the step value.
9579   Value *Step = CreateStepValue(ID.getStep());
9580 
9581   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9582          "Expected either an induction phi-node or a truncate of it!");
9583 
9584   // Construct the initial value of the vector IV in the vector loop preheader
9585   auto CurrIP = Builder.saveIP();
9586   Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
9587   if (isa<TruncInst>(EntryVal)) {
9588     assert(Start->getType()->isIntegerTy() &&
9589            "Truncation requires an integer type");
9590     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9591     Step = Builder.CreateTrunc(Step, TruncType);
9592     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9593   }
9594 
9595   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9596   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
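  // For example (illustrative), with start s, step d and VF = 4, SteppedStart
  // below is the vector <s, s+d, s+2*d, s+3*d>.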
9597   Value *SteppedStart = getStepVector(
9598       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9599 
9600   // We create vector phi nodes for both integer and floating-point induction
9601   // variables. Here, we determine the kind of arithmetic we will perform.
9602   Instruction::BinaryOps AddOp;
9603   Instruction::BinaryOps MulOp;
9604   if (Step->getType()->isIntegerTy()) {
9605     AddOp = Instruction::Add;
9606     MulOp = Instruction::Mul;
9607   } else {
9608     AddOp = ID.getInductionOpcode();
9609     MulOp = Instruction::FMul;
9610   }
9611 
9612   // Multiply the vectorization factor by the step using integer or
9613   // floating-point arithmetic as appropriate.
9614   Type *StepType = Step->getType();
9615   Value *RuntimeVF;
9616   if (Step->getType()->isFloatingPointTy())
9617     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9618   else
9619     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9620   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9621 
9622   // Create a vector splat to use in the induction update.
9623   //
9624   // FIXME: If the step is non-constant, we create the vector splat with
9625   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9626   //        handle a constant vector splat.
9627   Value *SplatVF = isa<Constant>(Mul)
9628                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9629                        : Builder.CreateVectorSplat(State.VF, Mul);
9630   Builder.restoreIP(CurrIP);
9631 
9632   // We may need to add the step a number of times, depending on the unroll
9633   // factor. The last of those goes into the PHI.
9634   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9635                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9636   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9637   Instruction *LastInduction = VecInd;
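  // Unrolled part i uses the vector IV advanced by i * RuntimeVF * Step; the
  // value computed after the last part feeds the backedge of the vector phi.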
9638   for (unsigned Part = 0; Part < State.UF; ++Part) {
9639     State.set(this, LastInduction, Part);
9640 
9641     if (isa<TruncInst>(EntryVal))
9642       State.ILV->addMetadata(LastInduction, EntryVal);
9643 
9644     LastInduction = cast<Instruction>(
9645         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9646     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9647   }
9648 
9649   // Move the last step to the end of the latch block. This ensures consistent
9650   // placement of all induction updates.
9651   auto *LoopVectorLatch =
9652       State.LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
9653   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
9654   LastInduction->moveBefore(Br);
9655   LastInduction->setName("vec.ind.next");
9656 
9657   VecInd->addIncoming(SteppedStart, State.CFG.VectorPreHeader);
9658   VecInd->addIncoming(LastInduction, LoopVectorLatch);
9659 }
9660 
9661 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9662   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9663 
9664   // Fast-math-flags propagate from the original induction instruction.
9665   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9666   if (IndDesc.getInductionBinOp() &&
9667       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9668     State.Builder.setFastMathFlags(
9669         IndDesc.getInductionBinOp()->getFastMathFlags());
9670 
9671   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9672   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9673     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9674     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9675     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9676       ScalarIV =
9677           Ty->isIntegerTy()
9678               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9679               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9680       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9681                                       getStartValue()->getLiveInIRValue(), Step,
9682                                       IndDesc);
9683       ScalarIV->setName("offset.idx");
9684     }
9685     if (TruncToTy) {
9686       assert(Step->getType()->isIntegerTy() &&
9687              "Truncation requires an integer step");
9688       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9689       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9690     }
9691     return ScalarIV;
9692   };
9693 
9694   Value *ScalarIV = CreateScalarIV(Step);
9695   if (State.VF.isVector()) {
9696     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9697     return;
9698   }
9699 
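  // For scalar VFs there is no step vector to build; unrolled part i simply
  // receives ScalarIV advanced by i steps.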
9700   for (unsigned Part = 0; Part < State.UF; ++Part) {
9701     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9702     Value *EntryPart;
9703     if (Step->getType()->isFloatingPointTy()) {
9704       Value *StartIdx =
9705           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9706       // Floating-point operations inherit FMF via the builder's flags.
9707       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9708       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9709                                             ScalarIV, MulOp);
9710     } else {
9711       Value *StartIdx =
9712           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9713       EntryPart = State.Builder.CreateAdd(
9714           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9715     }
9716     State.set(this, EntryPart, Part);
9717   }
9718 }
9719 
9720 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9721   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9722                                  State);
9723 }
9724 
9725 void VPBlendRecipe::execute(VPTransformState &State) {
9726   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9727   // We know that all PHIs in non-header blocks are converted into
9728   // selects, so we don't have to worry about the insertion order and we
9729   // can just use the builder.
9730   // At this point we generate the predication tree. There may be
9731   // duplications since this is a simple recursive scan, but future
9732   // optimizations will clean it up.
9733 
9734   unsigned NumIncoming = getNumIncomingValues();
9735 
9736   // Generate a sequence of selects of the form:
9737   // SELECT(Mask3, In3,
9738   //        SELECT(Mask2, In2,
9739   //               SELECT(Mask1, In1,
9740   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and take their value from In0.
9743   InnerLoopVectorizer::VectorParts Entry(State.UF);
9744   for (unsigned In = 0; In < NumIncoming; ++In) {
9745     for (unsigned Part = 0; Part < State.UF; ++Part) {
9746       // We might have single edge PHIs (blocks) - use an identity
9747       // 'select' for the first PHI operand.
9748       Value *In0 = State.get(getIncomingValue(In), Part);
9749       if (In == 0)
9750         Entry[Part] = In0; // Initialize with the first incoming value.
9751       else {
9752         // Select between the current value and the previous incoming edge
9753         // based on the incoming mask.
9754         Value *Cond = State.get(getMask(In), Part);
9755         Entry[Part] =
9756             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9757       }
9758     }
9759   }
9760   for (unsigned Part = 0; Part < State.UF; ++Part)
9761     State.set(this, Entry[Part], Part);
9762 }
9763 
9764 void VPInterleaveRecipe::execute(VPTransformState &State) {
9765   assert(!State.Instance && "Interleave group being replicated.");
9766   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9767                                       getStoredValues(), getMask());
9768 }
9769 
9770 void VPReductionRecipe::execute(VPTransformState &State) {
9771   assert(!State.Instance && "Reduction being replicated.");
9772   Value *PrevInChain = State.get(getChainOp(), 0);
9773   RecurKind Kind = RdxDesc->getRecurrenceKind();
9774   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9775   // Propagate the fast-math flags carried by the underlying instruction.
9776   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9777   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9778   for (unsigned Part = 0; Part < State.UF; ++Part) {
9779     Value *NewVecOp = State.get(getVecOp(), Part);
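    // For conditional reductions, substitute the reduction identity into the
    // masked-off lanes so they do not affect the reduced value.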
9780     if (VPValue *Cond = getCondOp()) {
9781       Value *NewCond = State.get(Cond, Part);
9782       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9783       Value *Iden = RdxDesc->getRecurrenceIdentity(
9784           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9785       Value *IdenVec =
9786           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9787       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9788       NewVecOp = Select;
9789     }
9790     Value *NewRed;
9791     Value *NextInChain;
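    // Ordered (strict FP) reductions fold the vector operand into the chain
    // sequentially; otherwise the vector operand is reduced and then combined
    // with the chain using the recurrence's min/max or binary op.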
9792     if (IsOrdered) {
9793       if (State.VF.isVector())
9794         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9795                                         PrevInChain);
9796       else
9797         NewRed = State.Builder.CreateBinOp(
9798             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9799             NewVecOp);
9800       PrevInChain = NewRed;
9801     } else {
9802       PrevInChain = State.get(getChainOp(), Part);
9803       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9804     }
9805     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9806       NextInChain =
9807           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9808                          NewRed, PrevInChain);
9809     } else if (IsOrdered)
9810       NextInChain = NewRed;
9811     else
9812       NextInChain = State.Builder.CreateBinOp(
9813           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9814           PrevInChain);
9815     State.set(this, NextInChain, Part);
9816   }
9817 }
9818 
9819 void VPReplicateRecipe::execute(VPTransformState &State) {
9820   if (State.Instance) { // Generate a single instance.
9821     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9822     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9823                                     IsPredicated, State);
9824     // Insert scalar instance packing it into a vector.
9825     if (AlsoPack && State.VF.isVector()) {
9826       // If we're constructing lane 0, initialize to start from poison.
9827       if (State.Instance->Lane.isFirstLane()) {
9828         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9829         Value *Poison = PoisonValue::get(
9830             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9831         State.set(this, Poison, State.Instance->Part);
9832       }
9833       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9834     }
9835     return;
9836   }
9837 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
9841   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9842   assert((!State.VF.isScalable() || IsUniform) &&
9843          "Can't scalarize a scalable vector");
9844   for (unsigned Part = 0; Part < State.UF; ++Part)
9845     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9846       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9847                                       VPIteration(Part, Lane), IsPredicated,
9848                                       State);
9849 }
9850 
9851 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9852   assert(State.Instance && "Branch on Mask works only on single instance.");
9853 
9854   unsigned Part = State.Instance->Part;
9855   unsigned Lane = State.Instance->Lane.getKnownLane();
9856 
9857   Value *ConditionBit = nullptr;
9858   VPValue *BlockInMask = getMask();
9859   if (BlockInMask) {
9860     ConditionBit = State.get(BlockInMask, Part);
9861     if (ConditionBit->getType()->isVectorTy())
9862       ConditionBit = State.Builder.CreateExtractElement(
9863           ConditionBit, State.Builder.getInt32(Lane));
9864   } else // Block in mask is all-one.
9865     ConditionBit = State.Builder.getTrue();
9866 
9867   // Replace the temporary unreachable terminator with a new conditional branch,
9868   // whose two destinations will be set later when they are created.
9869   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9870   assert(isa<UnreachableInst>(CurrentTerminator) &&
9871          "Expected to replace unreachable terminator with conditional branch.");
9872   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9873   CondBr->setSuccessor(0, nullptr);
9874   ReplaceInstWithInst(CurrentTerminator, CondBr);
9875 }
9876 
9877 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9878   assert(State.Instance && "Predicated instruction PHI works per instance.");
9879   Instruction *ScalarPredInst =
9880       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9881   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9882   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9883   assert(PredicatingBB && "Predicated block has no single predecessor.");
9884   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9885          "operand must be VPReplicateRecipe");
9886 
9887   // By current pack/unpack logic we need to generate only a single phi node: if
9888   // a vector value for the predicated instruction exists at this point it means
9889   // the instruction has vector users only, and a phi for the vector value is
9890   // needed. In this case the recipe of the predicated instruction is marked to
9891   // also do that packing, thereby "hoisting" the insert-element sequence.
9892   // Otherwise, a phi node for the scalar value is needed.
9893   unsigned Part = State.Instance->Part;
9894   if (State.hasVectorValue(getOperand(0), Part)) {
9895     Value *VectorValue = State.get(getOperand(0), Part);
9896     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9897     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9898     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9899     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9900     if (State.hasVectorValue(this, Part))
9901       State.reset(this, VPhi, Part);
9902     else
9903       State.set(this, VPhi, Part);
9904     // NOTE: Currently we need to update the value of the operand, so the next
9905     // predicated iteration inserts its generated value in the correct vector.
9906     State.reset(getOperand(0), VPhi, Part);
9907   } else {
9908     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9909     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9910     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9911                      PredicatingBB);
9912     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9913     if (State.hasScalarValue(this, *State.Instance))
9914       State.reset(this, Phi, *State.Instance);
9915     else
9916       State.set(this, Phi, *State.Instance);
9917     // NOTE: Currently we need to update the value of the operand, so the next
9918     // predicated iteration inserts its generated value in the correct vector.
9919     State.reset(getOperand(0), Phi, *State.Instance);
9920   }
9921 }
9922 
9923 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9924   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9925 
9926   // Attempt to issue a wide load.
9927   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9928   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9929 
9930   assert((LI || SI) && "Invalid Load/Store instruction");
9931   assert((!SI || StoredValue) && "No stored value provided for widened store");
9932   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9933 
9934   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9935 
9936   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9937   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9938   bool CreateGatherScatter = !Consecutive;
9939 
9940   auto &Builder = State.Builder;
9941   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9942   bool isMaskRequired = getMask();
9943   if (isMaskRequired)
9944     for (unsigned Part = 0; Part < State.UF; ++Part)
9945       BlockInMaskParts[Part] = State.get(getMask(), Part);
9946 
9947   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9948     // Calculate the pointer for the specific unroll-part.
9949     GetElementPtrInst *PartPtr = nullptr;
9950 
9951     bool InBounds = false;
9952     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9953       InBounds = gep->isInBounds();
9954     if (Reverse) {
9955       // If the address is consecutive but reversed, then the
9956       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VFs VScale is 1, so RunTimeVF = VF.getKnownMinValue()
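      // E.g. for VF = 4 and Part = 1 the offsets below are NumElt = -4 and
      // LastLane = -3, so the part pointer addresses Ptr - 7 and the wide
      // access covers the reversed elements [Ptr - 7, Ptr - 4].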
9959       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9960       // NumElt = -Part * RunTimeVF
9961       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9962       // LastLane = 1 - RunTimeVF
9963       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9964       PartPtr =
9965           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9966       PartPtr->setIsInBounds(InBounds);
9967       PartPtr = cast<GetElementPtrInst>(
9968           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9969       PartPtr->setIsInBounds(InBounds);
9970       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9971         BlockInMaskParts[Part] =
9972             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9973     } else {
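      // Consecutive, non-reversed access: this unroll part starts Part * VF
      // elements past the base pointer.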
9974       Value *Increment =
9975           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9976       PartPtr = cast<GetElementPtrInst>(
9977           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9978       PartPtr->setIsInBounds(InBounds);
9979     }
9980 
9981     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9982     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9983   };
9984 
9985   // Handle Stores:
9986   if (SI) {
9987     State.ILV->setDebugLocFromInst(SI);
9988 
9989     for (unsigned Part = 0; Part < State.UF; ++Part) {
9990       Instruction *NewSI = nullptr;
9991       Value *StoredVal = State.get(StoredValue, Part);
9992       if (CreateGatherScatter) {
9993         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9994         Value *VectorGep = State.get(getAddr(), Part);
9995         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9996                                             MaskPart);
9997       } else {
9998         if (Reverse) {
9999           // If we store to reverse consecutive memory locations, then we need
10000           // to reverse the order of elements in the stored value.
10001           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
10002           // We don't want to update the value in the map as it might be used in
10003           // another expression. So don't call resetVectorValue(StoredVal).
10004         }
10005         auto *VecPtr =
10006             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10007         if (isMaskRequired)
10008           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
10009                                             BlockInMaskParts[Part]);
10010         else
10011           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
10012       }
10013       State.ILV->addMetadata(NewSI, SI);
10014     }
10015     return;
10016   }
10017 
10018   // Handle loads.
10019   assert(LI && "Must have a load instruction");
10020   State.ILV->setDebugLocFromInst(LI);
10021   for (unsigned Part = 0; Part < State.UF; ++Part) {
10022     Value *NewLI;
10023     if (CreateGatherScatter) {
10024       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10025       Value *VectorGep = State.get(getAddr(), Part);
10026       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10027                                          nullptr, "wide.masked.gather");
10028       State.ILV->addMetadata(NewLI, LI);
10029     } else {
10030       auto *VecPtr =
10031           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10032       if (isMaskRequired)
10033         NewLI = Builder.CreateMaskedLoad(
10034             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10035             PoisonValue::get(DataTy), "wide.masked.load");
10036       else
10037         NewLI =
10038             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10039 
10040       // Add metadata to the load, but setVectorValue to the reverse shuffle.
10041       State.ILV->addMetadata(NewLI, LI);
10042       if (Reverse)
10043         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10044     }
10045 
10046     State.set(this, NewLI, Part);
10047   }
10048 }
10049 
10050 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10051 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10052 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10053 // for predication.
10054 static ScalarEpilogueLowering getScalarEpilogueLowering(
10055     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10056     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10057     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10058     LoopVectorizationLegality &LVL) {
10059   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10060   // don't look at hints or options, and don't request a scalar epilogue.
10061   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10062   // LoopAccessInfo (due to code dependency and not being able to reliably get
10063   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10064   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10065   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10066   // back to the old way and vectorize with versioning when forced. See D81345.)
10067   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10068                                                       PGSOQueryType::IRPass) &&
10069                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10070     return CM_ScalarEpilogueNotAllowedOptSize;
10071 
10072   // 2) If set, obey the directives
10073   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10074     switch (PreferPredicateOverEpilogue) {
10075     case PreferPredicateTy::ScalarEpilogue:
10076       return CM_ScalarEpilogueAllowed;
10077     case PreferPredicateTy::PredicateElseScalarEpilogue:
10078       return CM_ScalarEpilogueNotNeededUsePredicate;
10079     case PreferPredicateTy::PredicateOrDontVectorize:
10080       return CM_ScalarEpilogueNotAllowedUsePredicate;
10081     };
10082   }
10083 
10084   // 3) If set, obey the hints
10085   switch (Hints.getPredicate()) {
10086   case LoopVectorizeHints::FK_Enabled:
10087     return CM_ScalarEpilogueNotNeededUsePredicate;
10088   case LoopVectorizeHints::FK_Disabled:
10089     return CM_ScalarEpilogueAllowed;
10090   };
10091 
10092   // 4) if the TTI hook indicates this is profitable, request predication.
10093   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10094                                        LVL.getLAI()))
10095     return CM_ScalarEpilogueNotNeededUsePredicate;
10096 
10097   return CM_ScalarEpilogueAllowed;
10098 }
10099 
10100 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10101   // If Values have been set for this Def return the one relevant for \p Part.
10102   if (hasVectorValue(Def, Part))
10103     return Data.PerPartOutput[Def][Part];
10104 
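  // If no scalar value has been generated for Def either, it must be a
  // live-in value: broadcast the incoming IR value and cache it for this part.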
10105   if (!hasScalarValue(Def, {Part, 0})) {
10106     Value *IRV = Def->getLiveInIRValue();
10107     Value *B = ILV->getBroadcastInstrs(IRV);
10108     set(Def, B, Part);
10109     return B;
10110   }
10111 
10112   Value *ScalarValue = get(Def, {Part, 0});
10113   // If we aren't vectorizing, we can just copy the scalar map values over
10114   // to the vector map.
10115   if (VF.isScalar()) {
10116     set(Def, ScalarValue, Part);
10117     return ScalarValue;
10118   }
10119 
10120   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10121   bool IsUniform = RepR && RepR->isUniform();
10122 
10123   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10124   // Check if there is a scalar value for the selected lane.
10125   if (!hasScalarValue(Def, {Part, LastLane})) {
10126     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10127     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10128             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10129            "unexpected recipe found to be invariant");
10130     IsUniform = true;
10131     LastLane = 0;
10132   }
10133 
10134   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10135   // Set the insert point after the last scalarized instruction or after the
10136   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10137   // will directly follow the scalar definitions.
10138   auto OldIP = Builder.saveIP();
10139   auto NewIP =
10140       isa<PHINode>(LastInst)
10141           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10142           : std::next(BasicBlock::iterator(LastInst));
10143   Builder.SetInsertPoint(&*NewIP);
10144 
10145   // However, if we are vectorizing, we need to construct the vector values.
10146   // If the value is known to be uniform after vectorization, we can just
10147   // broadcast the scalar value corresponding to lane zero for each unroll
10148   // iteration. Otherwise, we construct the vector values using
10149   // insertelement instructions. Since the resulting vectors are stored in
10150   // State, we will only generate the insertelements once.
10151   Value *VectorValue = nullptr;
10152   if (IsUniform) {
10153     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10154     set(Def, VectorValue, Part);
10155   } else {
10156     // Initialize packing with insertelements to start from undef.
10157     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10158     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10159     set(Def, Undef, Part);
10160     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10161       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10162     VectorValue = get(Def, Part);
10163   }
10164   Builder.restoreIP(OldIP);
10165   return VectorValue;
10166 }
10167 
10168 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
10170 // VPlan-to-VPlan transformations from the very beginning without modifying the
10171 // input LLVM IR.
10172 static bool processLoopInVPlanNativePath(
10173     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10174     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10175     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10176     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10177     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10178     LoopVectorizationRequirements &Requirements) {
10179 
10180   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10181     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10182     return false;
10183   }
10184   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10185   Function *F = L->getHeader()->getParent();
10186   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10187 
10188   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10189       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10190 
10191   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10192                                 &Hints, IAI);
10193   // Use the planner for outer loop vectorization.
10194   // TODO: CM is not used at this point inside the planner. Turn CM into an
10195   // optional argument if we don't need it in the future.
10196   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10197                                Requirements, ORE);
10198 
10199   // Get user vectorization factor.
10200   ElementCount UserVF = Hints.getWidth();
10201 
10202   CM.collectElementTypesForWidening();
10203 
10204   // Plan how to best vectorize, return the best VF and its cost.
10205   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10206 
10207   // If we are stress testing VPlan builds, do not attempt to generate vector
10208   // code. Masked vector code generation support will follow soon.
10209   // Also, do not attempt to vectorize if no vector code will be produced.
10210   if (VPlanBuildStressTest || EnableVPlanPredication ||
10211       VectorizationFactor::Disabled() == VF)
10212     return false;
10213 
10214   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10215 
10216   {
10217     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10218                              F->getParent()->getDataLayout());
10219     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10220                            &CM, BFI, PSI, Checks);
10221     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10222                       << L->getHeader()->getParent()->getName() << "\"\n");
10223     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10224   }
10225 
10226   // Mark the loop as already vectorized to avoid vectorizing again.
10227   Hints.setAlreadyVectorized();
10228   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10229   return true;
10230 }
10231 
10232 // Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point, there
// will be a performance penalty from the conversion overhead and the change in
10235 // the vector width.
10236 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10237   SmallVector<Instruction *, 4> Worklist;
10238   for (BasicBlock *BB : L->getBlocks()) {
10239     for (Instruction &Inst : *BB) {
10240       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10241         if (S->getValueOperand()->getType()->isFloatTy())
10242           Worklist.push_back(S);
10243       }
10244     }
10245   }
10246 
  // Traverse the floating point stores upwards, searching for floating point
10248   // conversions.
10249   SmallPtrSet<const Instruction *, 4> Visited;
10250   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10251   while (!Worklist.empty()) {
10252     auto *I = Worklist.pop_back_val();
10253     if (!L->contains(I))
10254       continue;
10255     if (!Visited.insert(I).second)
10256       continue;
10257 
10258     // Emit a remark if the floating point store required a floating
10259     // point conversion.
10260     // TODO: More work could be done to identify the root cause such as a
10261     // constant or a function return type and point the user to it.
10262     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10263       ORE->emit([&]() {
10264         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10265                                           I->getDebugLoc(), L->getHeader())
10266                << "floating point conversion changes vector width. "
10267                << "Mixed floating point precision requires an up/down "
10268                << "cast that will negatively impact performance.";
10269       });
10270 
10271     for (Use &Op : I->operands())
10272       if (auto *OpI = dyn_cast<Instruction>(Op))
10273         Worklist.push_back(OpI);
10274   }
10275 }
10276 
10277 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10278     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10279                                !EnableLoopInterleaving),
10280       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10281                               !EnableLoopVectorization) {}
10282 
10283 bool LoopVectorizePass::processLoop(Loop *L) {
10284   assert((EnableVPlanNativePath || L->isInnermost()) &&
10285          "VPlan-native path is not enabled. Only process inner loops.");
10286 
10287 #ifndef NDEBUG
10288   const std::string DebugLocStr = getDebugLocString(L);
10289 #endif /* NDEBUG */
10290 
10291   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10292                     << L->getHeader()->getParent()->getName() << "' from "
10293                     << DebugLocStr << "\n");
10294 
10295   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10296 
10297   LLVM_DEBUG(
10298       dbgs() << "LV: Loop hints:"
10299              << " force="
10300              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10301                      ? "disabled"
10302                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10303                             ? "enabled"
10304                             : "?"))
10305              << " width=" << Hints.getWidth()
10306              << " interleave=" << Hints.getInterleave() << "\n");
10307 
10308   // Function containing loop
10309   Function *F = L->getHeader()->getParent();
10310 
10311   // Looking at the diagnostic output is the only way to determine if a loop
10312   // was vectorized (other than looking at the IR or machine code), so it
10313   // is important to generate an optimization remark for each loop. Most of
10314   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10315   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
10317   // benefit from vectorization, respectively.
10318 
10319   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10320     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10321     return false;
10322   }
10323 
10324   PredicatedScalarEvolution PSE(*SE, *L);
10325 
10326   // Check if it is legal to vectorize the loop.
10327   LoopVectorizationRequirements Requirements;
10328   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10329                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10330   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10331     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10332     Hints.emitRemarkWithHints();
10333     return false;
10334   }
10335 
10336   // Check the function attributes and profiles to find out if this function
10337   // should be optimized for size.
10338   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10339       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10340 
10341   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10342   // here. They may require CFG and instruction level transformations before
10343   // even evaluating whether vectorization is profitable. Since we cannot modify
10344   // the incoming IR, we need to build VPlan upfront in the vectorization
10345   // pipeline.
10346   if (!L->isInnermost())
10347     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10348                                         ORE, BFI, PSI, Hints, Requirements);
10349 
10350   assert(L->isInnermost() && "Inner loop expected.");
10351 
10352   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10353   // count by optimizing for size, to minimize overheads.
10354   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10355   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10356     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10357                       << "This loop is worth vectorizing only if no scalar "
10358                       << "iteration overheads are incurred.");
10359     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10360       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10361     else {
10362       LLVM_DEBUG(dbgs() << "\n");
10363       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10364     }
10365   }
10366 
10367   // Check the function attributes to see if implicit floats are allowed.
10368   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10369   // an integer loop and the vector instructions selected are purely integer
10370   // vector instructions?
10371   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10372     reportVectorizationFailure(
10373         "Can't vectorize when the NoImplicitFloat attribute is used",
10374         "loop not vectorized due to NoImplicitFloat attribute",
10375         "NoImplicitFloat", ORE, L);
10376     Hints.emitRemarkWithHints();
10377     return false;
10378   }
10379 
10380   // Check if the target supports potentially unsafe FP vectorization.
10381   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10382   // for the target we're vectorizing for, to make sure none of the
10383   // additional fp-math flags can help.
10384   if (Hints.isPotentiallyUnsafe() &&
10385       TTI->isFPVectorizationPotentiallyUnsafe()) {
10386     reportVectorizationFailure(
10387         "Potentially unsafe FP op prevents vectorization",
10388         "loop not vectorized due to unsafe FP support.",
10389         "UnsafeFP", ORE, L);
10390     Hints.emitRemarkWithHints();
10391     return false;
10392   }
10393 
10394   bool AllowOrderedReductions;
10395   // If the flag is set, use that instead and override the TTI behaviour.
10396   if (ForceOrderedReductions.getNumOccurrences() > 0)
10397     AllowOrderedReductions = ForceOrderedReductions;
10398   else
10399     AllowOrderedReductions = TTI->enableOrderedReductions();
10400   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10401     ORE->emit([&]() {
10402       auto *ExactFPMathInst = Requirements.getExactFPInst();
10403       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10404                                                  ExactFPMathInst->getDebugLoc(),
10405                                                  ExactFPMathInst->getParent())
10406              << "loop not vectorized: cannot prove it is safe to reorder "
10407                 "floating-point operations";
10408     });
10409     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10410                          "reorder floating-point operations\n");
10411     Hints.emitRemarkWithHints();
10412     return false;
10413   }
10414 
10415   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10416   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10417 
10418   // If an override option has been passed in for interleaved accesses, use it.
10419   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10420     UseInterleaved = EnableInterleavedMemAccesses;
10421 
10422   // Analyze interleaved memory accesses.
10423   if (UseInterleaved) {
10424     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10425   }
10426 
10427   // Use the cost model.
10428   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10429                                 F, &Hints, IAI);
10430   CM.collectValuesToIgnore();
10431   CM.collectElementTypesForWidening();
10432 
10433   // Use the planner for vectorization.
10434   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10435                                Requirements, ORE);
10436 
10437   // Get user vectorization factor and interleave count.
10438   ElementCount UserVF = Hints.getWidth();
10439   unsigned UserIC = Hints.getInterleave();
10440 
10441   // Plan how to best vectorize, return the best VF and its cost.
10442   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10443 
10444   VectorizationFactor VF = VectorizationFactor::Disabled();
10445   unsigned IC = 1;
10446 
10447   if (MaybeVF) {
10448     VF = *MaybeVF;
10449     // Select the interleave count.
10450     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10451   }
10452 
10453   // Identify the diagnostic messages that should be produced.
10454   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10455   bool VectorizeLoop = true, InterleaveLoop = true;
10456   if (VF.Width.isScalar()) {
10457     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10458     VecDiagMsg = std::make_pair(
10459         "VectorizationNotBeneficial",
10460         "the cost-model indicates that vectorization is not beneficial");
10461     VectorizeLoop = false;
10462   }
10463 
10464   if (!MaybeVF && UserIC > 1) {
10465     // Tell the user interleaving was avoided up-front, despite being explicitly
10466     // requested.
10467     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10468                          "interleaving should be avoided up front\n");
10469     IntDiagMsg = std::make_pair(
10470         "InterleavingAvoided",
10471         "Ignoring UserIC, because interleaving was avoided up front");
10472     InterleaveLoop = false;
10473   } else if (IC == 1 && UserIC <= 1) {
10474     // Tell the user interleaving is not beneficial.
10475     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10476     IntDiagMsg = std::make_pair(
10477         "InterleavingNotBeneficial",
10478         "the cost-model indicates that interleaving is not beneficial");
10479     InterleaveLoop = false;
10480     if (UserIC == 1) {
10481       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10482       IntDiagMsg.second +=
10483           " and is explicitly disabled or interleave count is set to 1";
10484     }
10485   } else if (IC > 1 && UserIC == 1) {
    // Tell the user that interleaving is beneficial but explicitly disabled.
10487     LLVM_DEBUG(
10488         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10489     IntDiagMsg = std::make_pair(
10490         "InterleavingBeneficialButDisabled",
10491         "the cost-model indicates that interleaving is beneficial "
10492         "but is explicitly disabled or interleave count is set to 1");
10493     InterleaveLoop = false;
10494   }
10495 
10496   // Override IC if user provided an interleave count.
10497   IC = UserIC > 0 ? UserIC : IC;
10498 
10499   // Emit diagnostic messages, if any.
10500   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10501   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10503     ORE->emit([&]() {
10504       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10505                                       L->getStartLoc(), L->getHeader())
10506              << VecDiagMsg.second;
10507     });
10508     ORE->emit([&]() {
10509       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10510                                       L->getStartLoc(), L->getHeader())
10511              << IntDiagMsg.second;
10512     });
10513     return false;
10514   } else if (!VectorizeLoop && InterleaveLoop) {
10515     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10516     ORE->emit([&]() {
10517       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10518                                         L->getStartLoc(), L->getHeader())
10519              << VecDiagMsg.second;
10520     });
10521   } else if (VectorizeLoop && !InterleaveLoop) {
10522     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10523                       << ") in " << DebugLocStr << '\n');
10524     ORE->emit([&]() {
10525       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10526                                         L->getStartLoc(), L->getHeader())
10527              << IntDiagMsg.second;
10528     });
10529   } else if (VectorizeLoop && InterleaveLoop) {
10530     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10531                       << ") in " << DebugLocStr << '\n');
10532     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10533   }
10534 
10535   bool DisableRuntimeUnroll = false;
10536   MDNode *OrigLoopID = L->getLoopID();
10537   {
10538     // Optimistically generate runtime checks. Drop them if they turn out to not
10539     // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10541     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10542                              F->getParent()->getDataLayout());
10543     if (!VF.Width.isScalar() || IC > 1)
10544       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate());
10545 
10546     using namespace ore;
10547     if (!VectorizeLoop) {
10548       assert(IC > 1 && "interleave count should not be 1 or 0");
10549       // If we decided that it is not legal to vectorize the loop, then
10550       // interleave it.
10551       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10552                                  &CM, BFI, PSI, Checks);
10553 
10554       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10555       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10556 
10557       ORE->emit([&]() {
10558         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10559                                   L->getHeader())
10560                << "interleaved loop (interleaved count: "
10561                << NV("InterleaveCount", IC) << ")";
10562       });
10563     } else {
10564       // If we decided that it is *legal* to vectorize the loop, then do it.
10565 
10566       // Consider vectorizing the epilogue too if it's profitable.
10567       VectorizationFactor EpilogueVF =
10568           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10569       if (EpilogueVF.Width.isVector()) {
10570 
10571         // The first pass vectorizes the main loop and creates a scalar epilogue
10572         // to be vectorized by executing the plan (potentially with a different
10573         // factor) again shortly afterwards.
10574         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10575         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10576                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10577 
10578         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10579         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10580                         DT);
10581         ++LoopsVectorized;
10582 
10583         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10584         formLCSSARecursively(*L, *DT, LI, SE);
10585 
10586         // Second pass vectorizes the epilogue and adjusts the control flow
10587         // edges from the first pass.
10588         EPI.MainLoopVF = EPI.EpilogueVF;
10589         EPI.MainLoopUF = EPI.EpilogueUF;
10590         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10591                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10592                                                  Checks);
10593 
10594         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10595 
10596         // Ensure that the start values for any VPReductionPHIRecipes are
10597         // updated before vectorising the epilogue loop.
10598         VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
10599         for (VPRecipeBase &R : Header->phis()) {
10600           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10601             if (auto *Resume = MainILV.getReductionResumeValue(
10602                     ReductionPhi->getRecurrenceDescriptor())) {
10603               VPValue *StartVal = new VPValue(Resume);
10604               BestEpiPlan.addExternalDef(StartVal);
10605               ReductionPhi->setOperand(0, StartVal);
10606             }
10607           }
10608         }
10609 
10610         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10611                         DT);
10612         ++LoopsEpilogueVectorized;
10613 
10614         if (!MainILV.areSafetyChecksAdded())
10615           DisableRuntimeUnroll = true;
10616       } else {
10617         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10618                                &LVL, &CM, BFI, PSI, Checks);
10619 
10620         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10621         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10622         ++LoopsVectorized;
10623 
10624         // Add metadata to disable runtime unrolling a scalar loop when there
10625         // are no runtime checks about strides and memory. A scalar loop that is
10626         // rarely used is not worth unrolling.
10627         if (!LB.areSafetyChecksAdded())
10628           DisableRuntimeUnroll = true;
10629       }
10630       // Report the vectorization decision.
10631       ORE->emit([&]() {
10632         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10633                                   L->getHeader())
10634                << "vectorized loop (vectorization width: "
10635                << NV("VectorizationFactor", VF.Width)
10636                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10637       });
10638     }
10639 
10640     if (ORE->allowExtraAnalysis(LV_NAME))
10641       checkMixedPrecision(L, ORE);
10642   }
10643 
10644   Optional<MDNode *> RemainderLoopID =
10645       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10646                                       LLVMLoopVectorizeFollowupEpilogue});
10647   if (RemainderLoopID.hasValue()) {
10648     L->setLoopID(RemainderLoopID.getValue());
10649   } else {
10650     if (DisableRuntimeUnroll)
10651       AddRuntimeUnrollDisableMetaData(L);
10652 
10653     // Mark the loop as already vectorized to avoid vectorizing again.
10654     Hints.setAlreadyVectorized();
10655   }
10656 
10657   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10658   return true;
10659 }
10660 
10661 LoopVectorizeResult LoopVectorizePass::runImpl(
10662     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10663     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10664     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10665     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10666     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10667   SE = &SE_;
10668   LI = &LI_;
10669   TTI = &TTI_;
10670   DT = &DT_;
10671   BFI = &BFI_;
10672   TLI = TLI_;
10673   AA = &AA_;
10674   AC = &AC_;
10675   GetLAA = &GetLAA_;
10676   DB = &DB_;
10677   ORE = &ORE_;
10678   PSI = PSI_;
10679 
10680   // Don't attempt if
10681   // 1. the target claims to have no vector registers, and
10682   // 2. interleaving won't help ILP.
10683   //
10684   // The second condition is necessary because, even if the target has no
10685   // vector registers, loop vectorization may still enable scalar
10686   // interleaving.
10687   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10688       TTI->getMaxInterleaveFactor(1) < 2)
10689     return LoopVectorizeResult(false, false);
10690 
10691   bool Changed = false, CFGChanged = false;
10692 
10693   // The vectorizer requires loops to be in simplified form.
10694   // Since simplification may add new inner loops, it has to run before the
10695   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10697   // vectorized.
10698   for (auto &L : *LI)
10699     Changed |= CFGChanged |=
10700         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10701 
10702   // Build up a worklist of inner-loops to vectorize. This is necessary as
10703   // the act of vectorizing or partially unrolling a loop creates new loops
10704   // and can invalidate iterators across the loops.
10705   SmallVector<Loop *, 8> Worklist;
10706 
10707   for (Loop *L : *LI)
10708     collectSupportedLoops(*L, LI, ORE, Worklist);
10709 
10710   LoopsAnalyzed += Worklist.size();
10711 
10712   // Now walk the identified inner loops.
10713   while (!Worklist.empty()) {
10714     Loop *L = Worklist.pop_back_val();
10715 
10716     // For the inner loops we actually process, form LCSSA to simplify the
10717     // transform.
10718     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10719 
10720     Changed |= CFGChanged |= processLoop(L);
10721   }
10722 
10723   // Process each loop nest in the function.
10724   return LoopVectorizeResult(Changed, CFGChanged);
10725 }
10726 
10727 PreservedAnalyses LoopVectorizePass::run(Function &F,
10728                                          FunctionAnalysisManager &AM) {
10729     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10730     auto &LI = AM.getResult<LoopAnalysis>(F);
10731     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10732     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10733     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10734     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10735     auto &AA = AM.getResult<AAManager>(F);
10736     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10737     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10738     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10739 
10740     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10741     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10742         [&](Loop &L) -> const LoopAccessInfo & {
10743       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10744                                         TLI, TTI, nullptr, nullptr, nullptr};
10745       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10746     };
10747     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10748     ProfileSummaryInfo *PSI =
10749         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10750     LoopVectorizeResult Result =
10751         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10752     if (!Result.MadeAnyChange)
10753       return PreservedAnalyses::all();
10754     PreservedAnalyses PA;
10755 
    // We currently do not preserve LoopInfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
10759     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10760     if (!EnableVPlanNativePath) {
10761       PA.preserve<LoopAnalysis>();
10762       PA.preserve<DominatorTreeAnalysis>();
10763     }
10764 
10765     if (Result.MadeCFGChange) {
10766       // Making CFG changes likely means a loop got vectorized. Indicate that
10767       // extra simplification passes should be run.
      // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10769       // be run if runtime checks have been added.
10770       AM.getResult<ShouldRunExtraVectorPasses>(F);
10771       PA.preserve<ShouldRunExtraVectorPasses>();
10772     } else {
10773       PA.preserveSet<CFGAnalyses>();
10774     }
10775     return PA;
10776 }
10777 
10778 void LoopVectorizePass::printPipeline(
10779     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10780   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10781       OS, MapClassName2PassName);
10782 
10783   OS << "<";
10784   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10785   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10786   OS << ">";
10787 }
10788