1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
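//
// As a rough illustration (a conceptual sketch only, eliding the remainder
// handling and runtime checks the pass actually emits), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten so that, for a vectorization factor of 4, each iteration adds
// four elements of 'b' and 'c' with a single wide (SIMD) operation and stores
// four elements of 'a', with the induction variable stepping by 4.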
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SmallPtrSet.h"
72 #include "llvm/ADT/SmallSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
90 #include "llvm/Analysis/ProfileSummaryInfo.h"
91 #include "llvm/Analysis/ScalarEvolution.h"
92 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
93 #include "llvm/Analysis/TargetLibraryInfo.h"
94 #include "llvm/Analysis/TargetTransformInfo.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfoMetadata.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/Metadata.h"
115 #include "llvm/IR/Module.h"
116 #include "llvm/IR/Operator.h"
117 #include "llvm/IR/PatternMatch.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/InitializePasses.h"
125 #include "llvm/Pass.h"
126 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/CommandLine.h"
128 #include "llvm/Support/Compiler.h"
129 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/ErrorHandling.h"
131 #include "llvm/Support/InstructionCost.h"
132 #include "llvm/Support/MathExtras.h"
133 #include "llvm/Support/raw_ostream.h"
134 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
135 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <map>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
200     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks with a "
202              "vectorize(enable) pragma."));
203 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the options are listed below.
// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
209 namespace PreferPredicateTy {
210   enum Option {
211     ScalarEpilogue = 0,
212     PredicateElseScalarEpilogue,
213     PredicateOrDontVectorize
214   };
215 } // namespace PreferPredicateTy
216 
217 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
218     "prefer-predicate-over-epilogue",
219     cl::init(PreferPredicateTy::ScalarEpilogue),
220     cl::Hidden,
221     cl::desc("Tail-folding and predication preferences over creating a scalar "
222              "epilogue loop."),
223     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
224                          "scalar-epilogue",
225                          "Don't tail-predicate loops, create scalar epilogue"),
226               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
227                          "predicate-else-scalar-epilogue",
228                          "prefer tail-folding, create scalar epilogue if tail "
229                          "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));
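// For illustration only, the tail-folding preference can be selected on the
// command line with an invocation along these lines (hypothetical example):
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...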
234 
235 static cl::opt<bool> MaximizeBandwidth(
236     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));
239 
240 static cl::opt<bool> EnableInterleavedMemAccesses(
241     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
242     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
243 
244 /// An interleave-group may need masking if it resides in a block that needs
245 /// predication, or in order to mask away gaps.
246 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
247     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));
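// For example (illustrative): a group that accesses A[3*i] and A[3*i + 2] but
// not A[3*i + 1] has a gap, so a single wide access covering all three lanes
// must mask out the unused member (a store must not write it, and a load of
// it may not be safe to speculate).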
249 
250 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
251     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip "
             "count below this number"));
254 
255 static cl::opt<unsigned> ForceTargetNumScalarRegs(
256     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of scalar registers."));
258 
259 static cl::opt<unsigned> ForceTargetNumVectorRegs(
260     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's number of vector registers."));
262 
263 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
264     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
265     cl::desc("A flag that overrides the target's max interleave factor for "
266              "scalar loops."));
267 
268 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
269     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
270     cl::desc("A flag that overrides the target's max interleave factor for "
271              "vectorized loops."));
272 
273 static cl::opt<unsigned> ForceTargetInstructionCost(
274     "force-target-instruction-cost", cl::init(0), cl::Hidden,
275     cl::desc("A flag that overrides the target's expected cost for "
276              "an instruction to a single constant value. Mostly "
277              "useful for getting consistent testing."));
278 
279 static cl::opt<bool> ForceTargetSupportsScalableVectors(
280     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
281     cl::desc(
282         "Pretend that scalable vectors are supported, even if the target does "
283         "not support them. This flag should only be used for testing."));
284 
285 static cl::opt<unsigned> SmallLoopCost(
286     "small-loop-cost", cl::init(20), cl::Hidden,
287     cl::desc(
288         "The cost of a loop that is considered 'small' by the interleaver."));
289 
290 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
291     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
292     cl::desc("Enable the use of the block frequency analysis to access PGO "
293              "heuristics minimizing code growth in cold regions and being more "
294              "aggressive in hot regions."));
295 
296 // Runtime interleave loops for load/store throughput.
297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
298     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
299     cl::desc(
300         "Enable runtime interleaving until load/store ports are saturated"));
301 
302 /// Interleave small loops with scalar reductions.
303 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
304     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
305     cl::desc("Enable interleaving for loops with small iteration counts that "
306              "contain scalar reductions to expose ILP."));
307 
308 /// The number of stores in a loop that are allowed to need predication.
309 static cl::opt<unsigned> NumberOfStoresToPredicate(
310     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
311     cl::desc("Max number of stores to be predicated behind an if."));
312 
313 static cl::opt<bool> EnableIndVarRegisterHeur(
314     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
315     cl::desc("Count the induction variable only once when interleaving"));
316 
317 static cl::opt<bool> EnableCondStoresVectorization(
318     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
320 
321 static cl::opt<unsigned> MaxNestedScalarReductionIC(
322     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
323     cl::desc("The maximum interleave count to use when interleaving a scalar "
324              "reduction in a nested loop."));
325 
326 static cl::opt<bool>
327     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
328                            cl::Hidden,
329                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
331 
332 static cl::opt<bool> ForceOrderedReductions(
333     "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
335              "FP reductions"));
336 
337 static cl::opt<bool> PreferPredicatedReductionSelect(
338     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
339     cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));
341 
342 cl::opt<bool> EnableVPlanNativePath(
343     "enable-vplan-native-path", cl::init(false), cl::Hidden,
344     cl::desc("Enable VPlan-native vectorization path with "
345              "support for outer loop vectorization."));
346 
347 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
349 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
350 // verification of the H-CFGs built.
351 static cl::opt<bool> VPlanBuildStressTest(
352     "vplan-build-stress-test", cl::init(false), cl::Hidden,
353     cl::desc(
354         "Build VPlan for every supported loop nest in the function and bail "
355         "out right after the build (stress test the VPlan H-CFG construction "
356         "in the VPlan-native vectorization path)."));
357 
358 cl::opt<bool> llvm::EnableLoopInterleaving(
359     "interleave-loops", cl::init(true), cl::Hidden,
360     cl::desc("Enable loop interleaving in Loop vectorization passes"));
361 cl::opt<bool> llvm::EnableLoopVectorization(
362     "vectorize-loops", cl::init(true), cl::Hidden,
363     cl::desc("Run the Loop vectorization passes"));
364 
365 cl::opt<bool> PrintVPlansInDotFormat(
366     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
367     cl::desc("Use dot format instead of plain text when dumping VPlans"));
368 
369 /// A helper function that returns true if the given type is irregular. The
370 /// type is irregular if its allocated size doesn't equal the store size of an
371 /// element of the corresponding vector type.
372 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
373   // Determine if an array of N elements of type Ty is "bitcast compatible"
374   // with a <N x Ty> vector.
375   // This is only true if there is no padding between the array elements.
376   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
377 }
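// For hasIrregularType, a concrete example (assuming a typical x86-64 data
// layout): x86_fp80 has a type size of 80 bits but an alloc size of 128 bits,
// so it is irregular, whereas i32 (32 bits for both) is regular.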
378 
379 /// A helper function that returns the reciprocal of the block probability of
380 /// predicated blocks. If we return X, we are assuming the predicated block
381 /// will execute once for every X iterations of the loop header.
382 ///
383 /// TODO: We should use actual block probability here, if available. Currently,
384 ///       we always assume predicated blocks have a 50% chance of executing.
385 static unsigned getReciprocalPredBlockProb() { return 2; }
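// A sketch of the intended use when costing a predicated block (illustrative
// only): the block's scalar cost is discounted by this factor, e.g.
//   BlockCost /= getReciprocalPredBlockProb();
// reflecting the assumption that the block runs on one in every two header
// iterations.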
386 
387 /// A helper function that returns an integer or floating-point constant with
388 /// value C.
389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391                            : ConstantFP::get(Ty, C);
392 }
393 
394 /// Returns "best known" trip count for the specified loop \p L as defined by
395 /// the following procedure:
396 ///   1) Returns exact trip count if it is known.
397 ///   2) Returns expected trip count according to profile data if any.
398 ///   3) Returns upper bound estimate if it is known.
399 ///   4) Returns None if all of the above failed.
400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401   // Check if exact trip count is known.
402   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403     return ExpectedTC;
404 
405   // Check if there is an expected trip count available from profile data.
406   if (LoopVectorizeWithBlockFrequency)
407     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408       return EstimatedTC;
409 
410   // Check if upper bound estimate is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412     return ExpectedTC;
413 
414   return None;
415 }
416 
417 // Forward declare GeneratedRTChecks.
418 class GeneratedRTChecks;
419 
420 namespace llvm {
421 
422 AnalysisKey ShouldRunExtraVectorPasses::Key;
423 
424 /// InnerLoopVectorizer vectorizes loops which contain only one basic
425 /// block to a specified vectorization factor (VF).
426 /// This class performs the widening of scalars into vectors, or multiple
427 /// scalars. This class also implements the following features:
428 /// * It inserts an epilogue loop for handling loops that don't have iteration
429 ///   counts that are known to be a multiple of the vectorization factor.
430 /// * It handles the code generation for reduction variables.
431 /// * Scalarization (implementation using scalars) of un-vectorizable
432 ///   instructions.
433 /// InnerLoopVectorizer does not perform any vectorization-legality
434 /// checks, and relies on the caller to check for the different legality
435 /// aspects. The InnerLoopVectorizer relies on the
436 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
438 class InnerLoopVectorizer {
439 public:
440   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
441                       LoopInfo *LI, DominatorTree *DT,
442                       const TargetLibraryInfo *TLI,
443                       const TargetTransformInfo *TTI, AssumptionCache *AC,
444                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
445                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
446                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
447                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
448       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
449         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
450         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
451         PSI(PSI), RTChecks(RTChecks) {
452     // Query this against the original loop and save it here because the profile
453     // of the original loop header may change as the transformation happens.
454     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
455         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
456   }
457 
458   virtual ~InnerLoopVectorizer() = default;
459 
460   /// Create a new empty loop that will contain vectorized instructions later
461   /// on, while the old loop will be used as the scalar remainder. Control flow
462   /// is generated around the vectorized (and scalar epilogue) loops consisting
463   /// of various checks and bypasses. Return the pre-header block of the new
464   /// loop and the start value for the canonical induction, if it is != 0. The
465   /// latter is the case when vectorizing the epilogue loop. In the case of
/// epilogue vectorization, this function is overridden to handle the more
467   /// complex control flow around the loops.
468   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
469 
470   /// Widen a single call instruction within the innermost loop.
471   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
472                             VPTransformState &State);
473 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
475   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
476 
477   // Return true if any runtime check is added.
478   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
479 
480   /// A type for vectorized values in the new loop. Each value from the
481   /// original loop, when vectorized, is represented by UF vector values in the
482   /// new unrolled loop, where UF is the unroll factor.
483   using VectorParts = SmallVector<Value *, 2>;
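  /// For example, with UF = 2 and VF = 4 a single original value is
  /// represented by two IR values, each a 4-element vector (illustrative):
  ///   VectorParts Parts = {PartForLanes0to3, PartForLanes4to7};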
484 
485   /// Vectorize a single vector PHINode in a block in the VPlan-native path
486   /// only.
487   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
488                            VPTransformState &State);
489 
490   /// A helper function to scalarize a single Instruction in the innermost loop.
491   /// Generates a sequence of scalar instances for each lane between \p MinLane
492   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
493   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
494   /// Instr's operands.
495   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
496                             const VPIteration &Instance, bool IfPredicateInstr,
497                             VPTransformState &State);
498 
499   /// Construct the vector value of a scalarized value \p V one lane at a time.
500   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
501                                  VPTransformState &State);
502 
503   /// Try to vectorize interleaved access group \p Group with the base address
504   /// given in \p Addr, optionally masking the vector operations if \p
505   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
506   /// values in the vectorized loop.
507   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
508                                 ArrayRef<VPValue *> VPDefs,
509                                 VPTransformState &State, VPValue *Addr,
510                                 ArrayRef<VPValue *> StoredValues,
511                                 VPValue *BlockInMask = nullptr);
512 
  /// Set the debug location in the class member's Builder using the debug
  /// location in \p V.
515   void setDebugLocFromInst(const Value *V);
516 
517   /// Fix the non-induction PHIs in \p Plan.
518   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
519 
520   /// Returns true if the reordering of FP operations is not allowed, but we are
521   /// able to vectorize with strict in-order reductions for the given RdxDesc.
522   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
523 
524   /// Create a broadcast instruction. This method generates a broadcast
525   /// instruction (shuffle) for loop invariant values and for the induction
526   /// value. If this is the induction variable then we extend it to N, N+1, ...
527   /// this is needed because each iteration in the loop corresponds to a SIMD
528   /// element.
529   virtual Value *getBroadcastInstrs(Value *V);
530 
531   /// Add metadata from one instruction to another.
532   ///
533   /// This includes both the original MDs from \p From and additional ones (\see
534   /// addNewMetadata).  Use this for *newly created* instructions in the vector
535   /// loop.
536   void addMetadata(Instruction *To, Instruction *From);
537 
538   /// Similar to the previous function but it adds the metadata to a
539   /// vector of instructions.
540   void addMetadata(ArrayRef<Value *> To, Instruction *From);
541 
542   // Returns the resume value (bc.merge.rdx) for a reduction as
543   // generated by fixReduction.
544   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
545 
546 protected:
547   friend class LoopVectorizationPlanner;
548 
549   /// A small list of PHINodes.
550   using PhiVector = SmallVector<PHINode *, 4>;
551 
552   /// A type for scalarized values in the new loop. Each value from the
553   /// original loop, when scalarized, is represented by UF x VF scalar values
554   /// in the new unrolled loop, where UF is the unroll factor and VF is the
555   /// vectorization factor.
556   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
557 
558   /// Set up the values of the IVs correctly when exiting the vector loop.
559   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
560                     Value *VectorTripCount, Value *EndValue,
561                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
562                     VPlan &Plan);
563 
564   /// Handle all cross-iteration phis in the header.
565   void fixCrossIterationPHIs(VPTransformState &State);
566 
567   /// Create the exit value of first order recurrences in the middle block and
568   /// update their users.
569   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
570                                VPTransformState &State);
571 
572   /// Create code for the loop exit value of the reduction.
573   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
574 
575   /// Clear NSW/NUW flags from reduction instructions if necessary.
576   void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
577                                VPTransformState &State);
578 
579   /// Iteratively sink the scalarized operands of a predicated instruction into
580   /// the block that was created for it.
581   void sinkScalarOperands(Instruction *PredInst);
582 
583   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
584   /// represented as.
585   void truncateToMinimalBitwidths(VPTransformState &State);
586 
587   /// Returns (and creates if needed) the original loop trip count.
588   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
589 
590   /// Returns (and creates if needed) the trip count of the widened loop.
591   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
592 
593   /// Returns a bitcasted value to the requested vector type.
594   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
595   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
596                                 const DataLayout &DL);
597 
598   /// Emit a bypass check to see if the vector trip count is zero, including if
599   /// it overflows.
600   void emitIterationCountCheck(BasicBlock *Bypass);
601 
602   /// Emit a bypass check to see if all of the SCEV assumptions we've
603   /// had to make are correct. Returns the block containing the checks or
604   /// nullptr if no checks have been added.
605   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
606 
607   /// Emit bypass checks to check any memory assumptions we may have made.
608   /// Returns the block containing the checks or nullptr if no checks have been
609   /// added.
610   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
611 
612   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
613   /// vector loop preheader, middle block and scalar preheader.
614   void createVectorLoopSkeleton(StringRef Prefix);
615 
  /// Create new phi nodes for the induction variables so that the scalar
  /// epilogue resumes the iteration count where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
619   /// vectorization) and the resume values can come from an additional bypass
620   /// block, the \p AdditionalBypass pair provides information about the bypass
621   /// block and the end value on the edge from bypass to this loop.
622   void createInductionResumeValues(
623       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
624 
625   /// Complete the loop skeleton by adding debug MDs, creating appropriate
626   /// conditional branches in the middle block, preparing the builder and
627   /// running the verifier. Return the preheader of the completed vector loop.
628   BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
629 
630   /// Add additional metadata to \p To that was not present on \p Orig.
631   ///
632   /// Currently this is used to add the noalias annotations based on the
633   /// inserted memchecks.  Use this for instructions that are *cloned* into the
634   /// vector loop.
635   void addNewMetadata(Instruction *To, const Instruction *Orig);
636 
637   /// Collect poison-generating recipes that may generate a poison value that is
638   /// used after vectorization, even when their operands are not poison. Those
639   /// recipes meet the following conditions:
640   ///  * Contribute to the address computation of a recipe generating a widen
641   ///    memory load/store (VPWidenMemoryInstructionRecipe or
642   ///    VPInterleaveRecipe).
643   ///  * Such a widen memory load/store has at least one underlying Instruction
644   ///    that is in a basic block that needs predication and after vectorization
645   ///    the generated instruction won't be predicated.
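  ///
  /// A sketch of the motivating case (illustrative only):
  ///   if (c) x = A[i - 1];
  /// Here an 'inbounds' GEP computing &A[i - 1] may yield poison for
  /// iterations where the branch is not taken; if the load becomes an
  /// unpredicated wide load after vectorization, that poison-generating flag
  /// must be dropped, which is what the collected set is used for.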
646   void collectPoisonGeneratingRecipes(VPTransformState &State);
647 
648   /// Allow subclasses to override and print debug traces before/after vplan
649   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
652 
653   /// The original loop.
654   Loop *OrigLoop;
655 
656   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
657   /// dynamic knowledge to simplify SCEV expressions and converts them to a
658   /// more usable form.
659   PredicatedScalarEvolution &PSE;
660 
661   /// Loop Info.
662   LoopInfo *LI;
663 
664   /// Dominator Tree.
665   DominatorTree *DT;
666 
667   /// Alias Analysis.
668   AAResults *AA;
669 
670   /// Target Library Info.
671   const TargetLibraryInfo *TLI;
672 
673   /// Target Transform Info.
674   const TargetTransformInfo *TTI;
675 
676   /// Assumption Cache.
677   AssumptionCache *AC;
678 
679   /// Interface to emit optimization remarks.
680   OptimizationRemarkEmitter *ORE;
681 
682   /// LoopVersioning.  It's only set up (non-null) if memchecks were
683   /// used.
684   ///
685   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
687   std::unique_ptr<LoopVersioning> LVer;
688 
689   /// The vectorization SIMD factor to use. Each vector will have this many
690   /// vector elements.
691   ElementCount VF;
692 
693   /// The vectorization unroll factor to use. Each scalar is vectorized to this
694   /// many different vector instructions.
695   unsigned UF;
696 
697   /// The builder that we use
698   IRBuilder<> Builder;
699 
700   // --- Vectorization state ---
701 
702   /// The vector-loop preheader.
703   BasicBlock *LoopVectorPreHeader;
704 
705   /// The scalar-loop preheader.
706   BasicBlock *LoopScalarPreHeader;
707 
708   /// Middle Block between the vector and the scalar.
709   BasicBlock *LoopMiddleBlock;
710 
711   /// The unique ExitBlock of the scalar loop if one exists.  Note that
712   /// there can be multiple exiting edges reaching this block.
713   BasicBlock *LoopExitBlock;
714 
715   /// The scalar loop body.
716   BasicBlock *LoopScalarBody;
717 
718   /// A list of all bypass blocks. The first block is the entry of the loop.
719   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
720 
721   /// Store instructions that were predicated.
722   SmallVector<Instruction *, 4> PredicatedInstructions;
723 
724   /// Trip count of the original loop.
725   Value *TripCount = nullptr;
726 
727   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
728   Value *VectorTripCount = nullptr;
729 
730   /// The legality analysis.
731   LoopVectorizationLegality *Legal;
732 
  /// The profitability analysis.
734   LoopVectorizationCostModel *Cost;
735 
736   // Record whether runtime checks are added.
737   bool AddedSafetyChecks = false;
738 
739   // Holds the end values for each induction variable. We save the end values
740   // so we can later fix-up the external users of the induction variables.
741   DenseMap<PHINode *, Value *> IVEndValues;
742 
743   /// BFI and PSI are used to check for profile guided size optimizations.
744   BlockFrequencyInfo *BFI;
745   ProfileSummaryInfo *PSI;
746 
747   // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
749   bool OptForSizeBasedOnProfile;
750 
  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
753   GeneratedRTChecks &RTChecks;
754 
755   // Holds the resume values for reductions in the loops, used to set the
756   // correct start value of reduction PHIs when vectorizing the epilogue.
757   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
758       ReductionResumeValues;
759 };
760 
761 class InnerLoopUnroller : public InnerLoopVectorizer {
762 public:
763   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
764                     LoopInfo *LI, DominatorTree *DT,
765                     const TargetLibraryInfo *TLI,
766                     const TargetTransformInfo *TTI, AssumptionCache *AC,
767                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
768                     LoopVectorizationLegality *LVL,
769                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
770                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
771       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
772                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
773                             BFI, PSI, Check) {}
774 
775 private:
776   Value *getBroadcastInstrs(Value *V) override;
777 };
778 
779 /// Encapsulate information regarding vectorization of a loop and its epilogue.
780 /// This information is meant to be updated and used across two stages of
781 /// epilogue vectorization.
782 struct EpilogueLoopVectorizationInfo {
783   ElementCount MainLoopVF = ElementCount::getFixed(0);
784   unsigned MainLoopUF = 0;
785   ElementCount EpilogueVF = ElementCount::getFixed(0);
786   unsigned EpilogueUF = 0;
787   BasicBlock *MainLoopIterationCountCheck = nullptr;
788   BasicBlock *EpilogueIterationCountCheck = nullptr;
789   BasicBlock *SCEVSafetyCheck = nullptr;
790   BasicBlock *MemSafetyCheck = nullptr;
791   Value *TripCount = nullptr;
792   Value *VectorTripCount = nullptr;
793 
794   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
795                                 ElementCount EVF, unsigned EUF)
796       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
797     assert(EUF == 1 &&
798            "A high UF for the epilogue loop is likely not beneficial.");
799   }
800 };
801 
802 /// An extension of the inner loop vectorizer that creates a skeleton for a
803 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
807 /// deriving two concrete strategy classes from this base class and invoking
808 /// them in succession from the loop vectorizer planner.
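///
/// A rough sketch of the resulting control flow (illustrative; the real
/// skeleton also contains the SCEV and memory runtime checks):
///   main iteration-count check -> main vector loop
///     -> epilogue iteration-count check -> epilogue vector loop
///     -> scalar remainder loop -> exit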
809 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
810 public:
811   InnerLoopAndEpilogueVectorizer(
812       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
813       DominatorTree *DT, const TargetLibraryInfo *TLI,
814       const TargetTransformInfo *TTI, AssumptionCache *AC,
815       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
816       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
817       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
818       GeneratedRTChecks &Checks)
819       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
820                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
821                             Checks),
822         EPI(EPI) {}
823 
824   // Override this function to handle the more complex control flow around the
825   // three loops.
826   std::pair<BasicBlock *, Value *>
827   createVectorizedLoopSkeleton() final override {
828     return createEpilogueVectorizedLoopSkeleton();
829   }
830 
831   /// The interface for creating a vectorized skeleton using one of two
832   /// different strategies, each corresponding to one execution of the vplan
833   /// as described above.
834   virtual std::pair<BasicBlock *, Value *>
835   createEpilogueVectorizedLoopSkeleton() = 0;
836 
837   /// Holds and updates state information required to vectorize the main loop
838   /// and its epilogue in two separate passes. This setup helps us avoid
839   /// regenerating and recomputing runtime safety checks. It also helps us to
840   /// shorten the iteration-count-check path length for the cases where the
841   /// iteration count of the loop is so small that the main vector loop is
842   /// completely skipped.
843   EpilogueLoopVectorizationInfo &EPI;
844 };
845 
846 /// A specialized derived class of inner loop vectorizer that performs
847 /// vectorization of *main* loops in the process of vectorizing loops and their
848 /// epilogues.
849 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
850 public:
851   EpilogueVectorizerMainLoop(
852       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
853       DominatorTree *DT, const TargetLibraryInfo *TLI,
854       const TargetTransformInfo *TTI, AssumptionCache *AC,
855       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
856       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
857       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
858       GeneratedRTChecks &Check)
859       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
860                                        EPI, LVL, CM, BFI, PSI, Check) {}
861   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
863   std::pair<BasicBlock *, Value *>
864   createEpilogueVectorizedLoopSkeleton() final override;
865 
866 protected:
867   /// Emits an iteration count bypass check once for the main loop (when \p
868   /// ForEpilogue is false) and once for the epilogue loop (when \p
869   /// ForEpilogue is true).
870   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
871   void printDebugTracesAtStart() override;
872   void printDebugTracesAtEnd() override;
873 };
874 
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
878 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
879 public:
880   EpilogueVectorizerEpilogueLoop(
881       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
882       DominatorTree *DT, const TargetLibraryInfo *TLI,
883       const TargetTransformInfo *TTI, AssumptionCache *AC,
884       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
885       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
886       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
887       GeneratedRTChecks &Checks)
888       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
889                                        EPI, LVL, CM, BFI, PSI, Checks) {
890     TripCount = EPI.TripCount;
891   }
892   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
894   std::pair<BasicBlock *, Value *>
895   createEpilogueVectorizedLoopSkeleton() final override;
896 
897 protected:
898   /// Emits an iteration count bypass check after the main vector loop has
899   /// finished to see if there are any iterations left to execute by either
900   /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
904   void printDebugTracesAtStart() override;
905   void printDebugTracesAtEnd() override;
906 };
907 } // end namespace llvm
908 
/// Look for a meaningful debug location on the instruction or its
910 /// operands.
911 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
912   if (!I)
913     return I;
914 
915   DebugLoc Empty;
916   if (I->getDebugLoc() != Empty)
917     return I;
918 
919   for (Use &Op : I->operands()) {
920     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
921       if (OpInst->getDebugLoc() != Empty)
922         return OpInst;
923   }
924 
925   return I;
926 }
927 
void InnerLoopVectorizer::setDebugLocFromInst(const Value *V) {
930   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
931     const DILocation *DIL = Inst->getDebugLoc();
932 
    // When an FSDiscriminator is enabled, we don't need to add the multiply
934     // factors to the discriminators.
935     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
936         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
937       // FIXME: For scalable vectors, assume vscale=1.
938       auto NewDIL =
939           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
940       if (NewDIL)
941         Builder.SetCurrentDebugLocation(NewDIL.getValue());
942       else
943         LLVM_DEBUG(dbgs()
944                    << "Failed to create new discriminator: "
945                    << DIL->getFilename() << " Line: " << DIL->getLine());
946     } else
947       Builder.SetCurrentDebugLocation(DIL);
948   } else
949     Builder.SetCurrentDebugLocation(DebugLoc());
950 }
951 
952 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
953 /// is passed, the message relates to that particular instruction.
954 #ifndef NDEBUG
955 static void debugVectorizationMessage(const StringRef Prefix,
956                                       const StringRef DebugMsg,
957                                       Instruction *I) {
958   dbgs() << "LV: " << Prefix << DebugMsg;
959   if (I != nullptr)
960     dbgs() << " " << *I;
961   else
962     dbgs() << '.';
963   dbgs() << '\n';
964 }
965 #endif
966 
967 /// Create an analysis remark that explains why vectorization failed
968 ///
969 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
970 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
971 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
972 /// the location of the remark.  \return the remark object that can be
973 /// streamed to.
974 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
975     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
976   Value *CodeRegion = TheLoop->getHeader();
977   DebugLoc DL = TheLoop->getStartLoc();
978 
979   if (I) {
980     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
983     if (I->getDebugLoc())
984       DL = I->getDebugLoc();
985   }
986 
987   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
988 }
989 
990 namespace llvm {
991 
992 /// Return a value for Step multiplied by VF.
993 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
994                        int64_t Step) {
995   assert(Ty->isIntegerTy() && "Expected an integer step");
996   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
997   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
998 }
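// For example, with Ty = i64 and Step = 2: a fixed VF of 4 yields the constant
// i64 8, while a scalable VF of <vscale x 4> yields 8 * vscale (materialized
// via CreateVScale).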
999 
1000 /// Return the runtime value for VF.
1001 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1002   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1003   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1004 }
1005 
1006 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1007                                   ElementCount VF) {
1008   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1009   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1010   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1011   return B.CreateUIToFP(RuntimeVF, FTy);
1012 }
1013 
1014 void reportVectorizationFailure(const StringRef DebugMsg,
1015                                 const StringRef OREMsg, const StringRef ORETag,
1016                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1017                                 Instruction *I) {
1018   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1019   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1020   ORE->emit(
1021       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1022       << "loop not vectorized: " << OREMsg);
1023 }
1024 
1025 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1026                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1027                              Instruction *I) {
1028   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1029   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1030   ORE->emit(
1031       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1032       << Msg);
1033 }
1034 
1035 } // end namespace llvm
1036 
1037 #ifndef NDEBUG
1038 /// \return string containing a file name and a line # for the given loop.
1039 static std::string getDebugLocString(const Loop *L) {
1040   std::string Result;
1041   if (L) {
1042     raw_string_ostream OS(Result);
1043     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1044       LoopDbgLoc.print(OS);
1045     else
1046       // Just print the module name.
1047       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1048     OS.flush();
1049   }
1050   return Result;
1051 }
1052 #endif
1053 
1054 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1055                                          const Instruction *Orig) {
1056   // If the loop was versioned with memchecks, add the corresponding no-alias
1057   // metadata.
1058   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1059     LVer->annotateInstWithNoAlias(To, Orig);
1060 }
1061 
1062 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1063     VPTransformState &State) {
1064 
1065   // Collect recipes in the backward slice of `Root` that may generate a poison
1066   // value that is used after vectorization.
1067   SmallPtrSet<VPRecipeBase *, 16> Visited;
1068   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1069     SmallVector<VPRecipeBase *, 16> Worklist;
1070     Worklist.push_back(Root);
1071 
1072     // Traverse the backward slice of Root through its use-def chain.
1073     while (!Worklist.empty()) {
1074       VPRecipeBase *CurRec = Worklist.back();
1075       Worklist.pop_back();
1076 
1077       if (!Visited.insert(CurRec).second)
1078         continue;
1079 
1080       // Prune search if we find another recipe generating a widen memory
1081       // instruction. Widen memory instructions involved in address computation
1082       // will lead to gather/scatter instructions, which don't need to be
1083       // handled.
1084       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1085           isa<VPInterleaveRecipe>(CurRec) ||
1086           isa<VPScalarIVStepsRecipe>(CurRec) ||
1087           isa<VPCanonicalIVPHIRecipe>(CurRec))
1088         continue;
1089 
1090       // This recipe contributes to the address computation of a widen
1091       // load/store. Collect recipe if its underlying instruction has
1092       // poison-generating flags.
1093       Instruction *Instr = CurRec->getUnderlyingInstr();
1094       if (Instr && Instr->hasPoisonGeneratingFlags())
1095         State.MayGeneratePoisonRecipes.insert(CurRec);
1096 
1097       // Add new definitions to the worklist.
1098       for (VPValue *operand : CurRec->operands())
1099         if (VPDef *OpDef = operand->getDef())
1100           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1101     }
1102   });
1103 
  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1107   auto Iter = depth_first(
1108       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1109   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1110     for (VPRecipeBase &Recipe : *VPBB) {
1111       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1112         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1113         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1114         if (AddrDef && WidenRec->isConsecutive() &&
1115             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1116           collectPoisonGeneratingInstrsInBackwardSlice(
1117               cast<VPRecipeBase>(AddrDef));
1118       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1119         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1120         if (AddrDef) {
1121           // Check if any member of the interleave group needs predication.
1122           const InterleaveGroup<Instruction> *InterGroup =
1123               InterleaveRec->getInterleaveGroup();
1124           bool NeedPredication = false;
1125           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1126                I < NumMembers; ++I) {
1127             Instruction *Member = InterGroup->getMember(I);
1128             if (Member)
1129               NeedPredication |=
1130                   Legal->blockNeedsPredication(Member->getParent());
1131           }
1132 
1133           if (NeedPredication)
1134             collectPoisonGeneratingInstrsInBackwardSlice(
1135                 cast<VPRecipeBase>(AddrDef));
1136         }
1137       }
1138     }
1139   }
1140 }
1141 
1142 void InnerLoopVectorizer::addMetadata(Instruction *To,
1143                                       Instruction *From) {
1144   propagateMetadata(To, From);
1145   addNewMetadata(To, From);
1146 }
1147 
1148 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1149                                       Instruction *From) {
1150   for (Value *V : To) {
1151     if (Instruction *I = dyn_cast<Instruction>(V))
1152       addMetadata(I, From);
1153   }
1154 }
1155 
1156 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1157     const RecurrenceDescriptor &RdxDesc) {
1158   auto It = ReductionResumeValues.find(&RdxDesc);
1159   assert(It != ReductionResumeValues.end() &&
1160          "Expected to find a resume value for the reduction.");
1161   return It->second;
1162 }
1163 
1164 namespace llvm {
1165 
// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
1168 enum ScalarEpilogueLowering {
1169 
1170   // The default: allowing scalar epilogues.
1171   CM_ScalarEpilogueAllowed,
1172 
1173   // Vectorization with OptForSize: don't allow epilogues.
1174   CM_ScalarEpilogueNotAllowedOptSize,
1175 
  // A special case of vectorization with OptForSize: loops with a very small
1177   // trip count are considered for vectorization under OptForSize, thereby
1178   // making sure the cost of their loop body is dominant, free of runtime
1179   // guards and scalar iteration overheads.
1180   CM_ScalarEpilogueNotAllowedLowTripLoop,
1181 
1182   // Loop hint predicate indicating an epilogue is undesired.
1183   CM_ScalarEpilogueNotNeededUsePredicate,
1184 
1185   // Directive indicating we must either tail fold or not vectorize
1186   CM_ScalarEpilogueNotAllowedUsePredicate
1187 };
1188 
1189 /// ElementCountComparator creates a total ordering for ElementCount
1190 /// for the purposes of using it in a set structure.
1191 struct ElementCountComparator {
1192   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1193     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1194            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1195   }
1196 };
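// For example, the ordering ElementCountComparator induces is 1, 2, 4, ...
// (all fixed counts) followed by vscale x 1, vscale x 2, ... (all scalable
// counts), because the scalable flag compares greater in the leading tuple
// element.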
1197 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1198 
1199 /// LoopVectorizationCostModel - estimates the expected speedups due to
1200 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1203 /// expected speedup/slowdowns due to the supported instruction set. We use the
1204 /// TargetTransformInfo to query the different backends for the cost of
1205 /// different operations.
1206 class LoopVectorizationCostModel {
1207 public:
1208   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1209                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1210                              LoopVectorizationLegality *Legal,
1211                              const TargetTransformInfo &TTI,
1212                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1213                              AssumptionCache *AC,
1214                              OptimizationRemarkEmitter *ORE, const Function *F,
1215                              const LoopVectorizeHints *Hints,
1216                              InterleavedAccessInfo &IAI)
1217       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1218         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1219         Hints(Hints), InterleaveInfo(IAI) {}
1220 
1221   /// \return An upper bound for the vectorization factors (both fixed and
1222   /// scalable). If the factors are 0, vectorization and interleaving should be
1223   /// avoided up front.
1224   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1225 
1226   /// \return True if runtime checks are required for vectorization, and false
1227   /// otherwise.
1228   bool runtimeChecksRequired();
1229 
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
  /// that vectorization factor is selected whenever vectorization is possible.
1234   VectorizationFactor
1235   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1236 
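  /// \return The most profitable vectorization factor for the epilogue loop,
  /// assuming the main loop is vectorized with \p MaxVF and using the plans
  /// available in \p LVP.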
1237   VectorizationFactor
1238   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1239                                     const LoopVectorizationPlanner &LVP);
1240 
1241   /// Setup cost-based decisions for user vectorization factor.
1242   /// \return true if the UserVF is a feasible VF to be chosen.
1243   bool selectUserVectorizationFactor(ElementCount UserVF) {
1244     collectUniformsAndScalars(UserVF);
1245     collectInstsToScalarize(UserVF);
1246     return expectedCost(UserVF).first.isValid();
1247   }
1248 
  /// \return The sizes (in bits) of the smallest and widest types in the code
1250   /// that needs to be vectorized. We ignore values that remain scalar such as
1251   /// 64 bit loop indices.
1252   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1253 
1254   /// \return The desired interleave count.
1255   /// If interleave count has been specified by metadata it will be returned.
1256   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1257   /// are the selected vectorization factor and the cost of the selected VF.
1258   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1259 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1267   void setCostBasedWideningDecision(ElementCount VF);
1268 
1269   /// A struct that represents some properties of the register usage
1270   /// of a loop.
1271   struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1278   };
1279 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1282   SmallVector<RegisterUsage, 8>
1283   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1284 
1285   /// Collect values we want to ignore in the cost model.
1286   void collectValuesToIgnore();
1287 
1288   /// Collect all element types in the loop for which widening is needed.
1289   void collectElementTypesForWidening();
1290 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1293   void collectInLoopReductions();
1294 
1295   /// Returns true if we should use strict in-order reductions for the given
1296   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1297   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1298   /// of FP operations.
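  /// For example (illustrative): a floating-point sum written without
  /// reassociation-enabling fast-math flags must preserve the original
  /// evaluation order, so it cannot be split into independent partial sums
  /// and is instead kept as an in-order (strict) reduction.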
1299   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1300     return !Hints->allowReordering() && RdxDesc.isOrdered();
1301   }
1302 
1303   /// \returns The smallest bitwidth each instruction can be represented with.
1304   /// The vector equivalents of these instructions should be truncated to this
1305   /// type.
1306   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1307     return MinBWs;
1308   }
1309 
1310   /// \returns True if it is more profitable to scalarize instruction \p I for
1311   /// vectorization factor \p VF.
1312   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1313     assert(VF.isVector() &&
1314            "Profitable to scalarize relevant only for VF > 1.");
1315 
1316     // Cost model is not run in the VPlan-native path - return conservative
1317     // result until this changes.
1318     if (EnableVPlanNativePath)
1319       return false;
1320 
1321     auto Scalars = InstsToScalarize.find(VF);
1322     assert(Scalars != InstsToScalarize.end() &&
1323            "VF not yet analyzed for scalarization profitability");
1324     return Scalars->second.find(I) != Scalars->second.end();
1325   }
1326 
1327   /// Returns true if \p I is known to be uniform after vectorization.
1328   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1329     if (VF.isScalar())
1330       return true;
1331 
1332     // Cost model is not run in the VPlan-native path - return conservative
1333     // result until this changes.
1334     if (EnableVPlanNativePath)
1335       return false;
1336 
1337     auto UniformsPerVF = Uniforms.find(VF);
1338     assert(UniformsPerVF != Uniforms.end() &&
1339            "VF not yet analyzed for uniformity");
1340     return UniformsPerVF->second.count(I);
1341   }
1342 
1343   /// Returns true if \p I is known to be scalar after vectorization.
1344   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1345     if (VF.isScalar())
1346       return true;
1347 
1348     // Cost model is not run in the VPlan-native path - return conservative
1349     // result until this changes.
1350     if (EnableVPlanNativePath)
1351       return false;
1352 
1353     auto ScalarsPerVF = Scalars.find(VF);
1354     assert(ScalarsPerVF != Scalars.end() &&
1355            "Scalar values are not calculated for VF");
1356     return ScalarsPerVF->second.count(I);
1357   }
1358 
1359   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1360   /// for vectorization factor \p VF.
1361   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1362     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1363            !isProfitableToScalarize(I, VF) &&
1364            !isScalarAfterVectorization(I, VF);
1365   }
1366 
1367   /// Decision that was taken during cost calculation for memory instruction.
1368   enum InstWidening {
1369     CM_Unknown,
1370     CM_Widen,         // For consecutive accesses with stride +1.
1371     CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // For accesses handled as part of an interleave group.
    CM_GatherScatter, // For non-consecutive accesses using gather/scatter.
    CM_Scalarize      // For accesses replicated as scalar operations.
1375   };
1376 
1377   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1378   /// instruction \p I and vector width \p VF.
1379   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1380                            InstructionCost Cost) {
1381     assert(VF.isVector() && "Expected VF >=2");
1382     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1383   }
1384 
1385   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1386   /// interleaving group \p Grp and vector width \p VF.
1387   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1388                            ElementCount VF, InstWidening W,
1389                            InstructionCost Cost) {
1390     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1393     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1394       if (auto *I = Grp->getMember(i)) {
1395         if (Grp->getInsertPos() == I)
1396           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1397         else
1398           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1399       }
1400     }
1401   }
1402 
1403   /// Return the cost model decision for the given instruction \p I and vector
1404   /// width \p VF. Return CM_Unknown if this instruction did not pass
1405   /// through the cost modeling.
1406   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1407     assert(VF.isVector() && "Expected VF to be a vector VF");
1408     // Cost model is not run in the VPlan-native path - return conservative
1409     // result until this changes.
1410     if (EnableVPlanNativePath)
1411       return CM_GatherScatter;
1412 
1413     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1414     auto Itr = WideningDecisions.find(InstOnVF);
1415     if (Itr == WideningDecisions.end())
1416       return CM_Unknown;
1417     return Itr->second.first;
1418   }
1419 
1420   /// Return the vectorization cost for the given instruction \p I and vector
1421   /// width \p VF.
1422   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1423     assert(VF.isVector() && "Expected VF >=2");
1424     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1425     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1426            "The cost is not calculated");
1427     return WideningDecisions[InstOnVF].second;
1428   }
1429 
1430   /// Return True if instruction \p I is an optimizable truncate whose operand
1431   /// is an induction variable. Such a truncate will be removed by adding a new
1432   /// induction variable with the destination type.
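  /// For example (illustrative): a trunc of an i64 induction variable to i32
  /// can be replaced by a dedicated i32 induction variable with the same start
  /// and step, removing the truncate from the vectorized loop.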
1433   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1434     // If the instruction is not a truncate, return false.
1435     auto *Trunc = dyn_cast<TruncInst>(I);
1436     if (!Trunc)
1437       return false;
1438 
1439     // Get the source and destination types of the truncate.
1440     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1441     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1442 
1443     // If the truncate is free for the given types, return false. Replacing a
1444     // free truncate with an induction variable would add an induction variable
1445     // update instruction to each iteration of the loop. We exclude from this
1446     // check the primary induction variable since it will need an update
1447     // instruction regardless.
1448     Value *Op = Trunc->getOperand(0);
1449     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1450       return false;
1451 
1452     // If the truncated value is not an induction variable, return false.
1453     return Legal->isInductionPhi(Op);
1454   }
1455 
1456   /// Collects the instructions to scalarize for each predicated instruction in
1457   /// the loop.
1458   void collectInstsToScalarize(ElementCount VF);
1459 
1460   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the cost-model decisions for Load/Store instructions,
  /// which may be vectorized via interleaving or gather/scatter, or scalarized.
1463   void collectUniformsAndScalars(ElementCount VF) {
1464     // Do the analysis once.
1465     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1466       return;
1467     setCostBasedWideningDecision(VF);
1468     collectLoopUniforms(VF);
1469     collectLoopScalars(VF);
1470   }
1471 
1472   /// Returns true if the target machine supports masked store operation
1473   /// for the given \p DataType and kind of access to \p Ptr.
1474   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1475     return Legal->isConsecutivePtr(DataType, Ptr) &&
1476            TTI.isLegalMaskedStore(DataType, Alignment);
1477   }
1478 
1479   /// Returns true if the target machine supports masked load operation
1480   /// for the given \p DataType and kind of access to \p Ptr.
1481   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1482     return Legal->isConsecutivePtr(DataType, Ptr) &&
1483            TTI.isLegalMaskedLoad(DataType, Alignment);
1484   }
1485 
1486   /// Returns true if the target machine can represent \p V as a masked gather
1487   /// or scatter operation.
1488   bool isLegalGatherOrScatter(Value *V,
1489                               ElementCount VF = ElementCount::getFixed(1)) {
1490     bool LI = isa<LoadInst>(V);
1491     bool SI = isa<StoreInst>(V);
1492     if (!LI && !SI)
1493       return false;
1494     auto *Ty = getLoadStoreType(V);
1495     Align Align = getLoadStoreAlignment(V);
1496     if (VF.isVector())
1497       Ty = VectorType::get(Ty, VF);
1498     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1499            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1500   }
1501 
1502   /// Returns true if the target machine supports all of the reduction
1503   /// variables found for the given VF.
1504   bool canVectorizeReductions(ElementCount VF) const {
1505     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1506       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1507       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1508     }));
1509   }
1510 
1511   /// Returns true if \p I is an instruction that will be scalarized with
1512   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1513   /// instructions include conditional stores and instructions that may divide
1514   /// by zero.
1515   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1516 
1517   // Returns true if \p I is an instruction that will be predicated either
1518   // through scalar predication or masked load/store or masked gather/scatter.
1519   // \p VF is the vectorization factor that will be used to vectorize \p I.
1520   // Superset of instructions that return true for isScalarWithPredication.
1521   bool isPredicatedInst(Instruction *I, ElementCount VF,
1522                         bool IsKnownUniform = false) {
1523     // When we know the load is uniform and the original scalar loop was not
1524     // predicated we don't need to mark it as a predicated instruction. Any
1525     // vectorised blocks created when tail-folding are something artificial we
1526     // have introduced and we know there is always at least one active lane.
1527     // That's why we call Legal->blockNeedsPredication here because it doesn't
1528     // query tail-folding.
1529     if (IsKnownUniform && isa<LoadInst>(I) &&
1530         !Legal->blockNeedsPredication(I->getParent()))
1531       return false;
1532     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1533       return false;
1534     // Loads and stores that need some form of masked operation are predicated
1535     // instructions.
1536     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1537       return Legal->isMaskRequired(I);
1538     return isScalarWithPredication(I, VF);
1539   }
1540 
1541   /// Returns true if \p I is a memory instruction with consecutive memory
1542   /// access that can be widened.
1543   bool
1544   memoryInstructionCanBeWidened(Instruction *I,
1545                                 ElementCount VF = ElementCount::getFixed(1));
1546 
1547   /// Returns true if \p I is a memory instruction in an interleaved-group
1548   /// of memory accesses that can be vectorized with wide vector loads/stores
1549   /// and shuffles.
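  /// For example (illustrative): loads of A[2*i] and A[2*i+1] in the same
  /// iteration form an interleave group with factor 2; with VF = 4 they can be
  /// widened into a single load of 8 consecutive elements followed by shuffles
  /// that de-interleave the even and odd lanes.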
1550   bool
1551   interleavedAccessCanBeWidened(Instruction *I,
1552                                 ElementCount VF = ElementCount::getFixed(1));
1553 
1554   /// Check if \p Instr belongs to any interleaved access group.
1555   bool isAccessInterleaved(Instruction *Instr) {
1556     return InterleaveInfo.isInterleaved(Instr);
1557   }
1558 
1559   /// Get the interleaved access group that \p Instr belongs to.
1560   const InterleaveGroup<Instruction> *
1561   getInterleavedAccessGroup(Instruction *Instr) {
1562     return InterleaveInfo.getInterleaveGroup(Instr);
1563   }
1564 
1565   /// Returns true if we're required to use a scalar epilogue for at least
1566   /// the final iteration of the original loop.
1567   bool requiresScalarEpilogue(ElementCount VF) const {
1568     if (!isScalarEpilogueAllowed())
1569       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1572     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1573       return true;
1574     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1575   }
1576 
  /// Returns true if a scalar epilogue is allowed, i.e. not disallowed due to
  /// optsize or a loop hint annotation.
1579   bool isScalarEpilogueAllowed() const {
1580     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1581   }
1582 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1584   bool foldTailByMasking() const { return FoldTailByMasking; }
1585 
  /// Returns true if the instructions in block \p BB require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
1589   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1590     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1591   }
1592 
1593   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1594   /// nodes to the chain of instructions representing the reductions. Uses a
1595   /// MapVector to ensure deterministic iteration order.
1596   using ReductionChainMap =
1597       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1598 
1599   /// Return the chain of instructions representing an inloop reduction.
1600   const ReductionChainMap &getInLoopReductionChains() const {
1601     return InLoopReductionChains;
1602   }
1603 
1604   /// Returns true if the Phi is part of an inloop reduction.
1605   bool isInLoopReduction(PHINode *Phi) const {
1606     return InLoopReductionChains.count(Phi);
1607   }
1608 
1609   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1610   /// with factor VF.  Return the cost of the instruction, including
1611   /// scalarization overhead if it's needed.
1612   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1613 
1614   /// Estimate cost of a call instruction CI if it were vectorized with factor
1615   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1619   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1620                                     bool &NeedToScalarize) const;
1621 
1622   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1623   /// that of B.
1624   bool isMoreProfitable(const VectorizationFactor &A,
1625                         const VectorizationFactor &B) const;
1626 
1627   /// Invalidates decisions already taken by the cost model.
1628   void invalidateCostModelingDecisions() {
1629     WideningDecisions.clear();
1630     Uniforms.clear();
1631     Scalars.clear();
1632   }
1633 
1634 private:
1635   unsigned NumPredStores = 0;
1636 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, or otherwise the value returned by
  /// the corresponding TTI method.
1640   Optional<unsigned> getVScaleForTuning() const;
1641 
1642   /// \return An upper bound for the vectorization factors for both
1643   /// fixed and scalable vectorization, where the minimum-known number of
1644   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1645   /// disabled or unsupported, then the scalable part will be equal to
1646   /// ElementCount::getScalable(0).
1647   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1648                                            ElementCount UserVF,
1649                                            bool FoldTailByMasking);
1650 
  /// \return the maximized element count based on the target's vector
1652   /// registers and the loop trip-count, but limited to a maximum safe VF.
1653   /// This is a helper function of computeFeasibleMaxVF.
1654   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1655                                        unsigned SmallestType,
1656                                        unsigned WidestType,
1657                                        ElementCount MaxSafeVF,
1658                                        bool FoldTailByMasking);
1659 
1660   /// \return the maximum legal scalable VF, based on the safe max number
1661   /// of elements.
1662   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1663 
1664   /// The vectorization cost is a combination of the cost itself and a boolean
1665   /// indicating whether any of the contributing operations will actually
1666   /// operate on vector values after type legalization in the backend. If this
1667   /// latter value is false, then all operations will be scalarized (i.e. no
1668   /// vectorization has actually taken place).
1669   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1670 
1671   /// Returns the expected execution cost. The unit of the cost does
1672   /// not matter because we use the 'cost' units to compare different
1673   /// vector widths. The cost that is returned is *not* normalized by
1674   /// the factor width. If \p Invalid is not nullptr, this function
1675   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1676   /// each instruction that has an Invalid cost for the given VF.
1677   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1678   VectorizationCostTy
1679   expectedCost(ElementCount VF,
1680                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1681 
1682   /// Returns the execution time cost of an instruction for a given vector
1683   /// width. Vector width of one means scalar.
1684   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1685 
1686   /// The cost-computation logic from getInstructionCost which provides
1687   /// the vector type as an output parameter.
1688   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1689                                      Type *&VectorTy);
1690 
1691   /// Return the cost of instructions in an inloop reduction pattern, if I is
1692   /// part of that pattern.
1693   Optional<InstructionCost>
1694   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1695                           TTI::TargetCostKind CostKind);
1696 
1697   /// Calculate vectorization cost of memory instruction \p I.
1698   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1699 
1700   /// The cost computation for scalarized memory instruction.
1701   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1702 
1703   /// The cost computation for interleaving group of memory instructions.
1704   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1705 
1706   /// The cost computation for Gather/Scatter instruction.
1707   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1708 
1709   /// The cost computation for widening instruction \p I with consecutive
1710   /// memory access.
1711   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1712 
1713   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1714   /// Load: scalar load + broadcast.
1715   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1716   /// element)
1717   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1718 
1719   /// Estimate the overhead of scalarizing an instruction. This is a
1720   /// convenience wrapper for the type-based getScalarizationOverhead API.
1721   InstructionCost getScalarizationOverhead(Instruction *I,
1722                                            ElementCount VF) const;
1723 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1726   bool isConsecutiveLoadOrStore(Instruction *I);
1727 
1728   /// Returns true if an artificially high cost for emulated masked memrefs
1729   /// should be used.
1730   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1731 
1732   /// Map of scalar integer values to the smallest bitwidth they can be legally
1733   /// represented as. The vector equivalents of these values should be truncated
1734   /// to this type.
1735   MapVector<Instruction *, uint64_t> MinBWs;
1736 
1737   /// A type representing the costs for instructions if they were to be
1738   /// scalarized rather than vectorized. The entries are Instruction-Cost
1739   /// pairs.
1740   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1741 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1744   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1745 
1746   /// Records whether it is allowed to have the original scalar loop execute at
1747   /// least once. This may be needed as a fallback loop in case runtime
1748   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
1750   /// or as a peel-loop to handle gaps in interleave-groups.
1751   /// Under optsize and when the trip count is very small we don't allow any
1752   /// iterations to execute in the scalar loop.
1753   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1754 
1755   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1756   bool FoldTailByMasking = false;
1757 
1758   /// A map holding scalar costs for different vectorization factors. The
1759   /// presence of a cost for an instruction in the mapping indicates that the
1760   /// instruction will be scalarized when vectorizing with the associated
1761   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1762   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1763 
1764   /// Holds the instructions known to be uniform after vectorization.
1765   /// The data is collected per VF.
1766   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1767 
1768   /// Holds the instructions known to be scalar after vectorization.
1769   /// The data is collected per VF.
1770   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1771 
1772   /// Holds the instructions (address computations) that are forced to be
1773   /// scalarized.
1774   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1775 
  /// PHINodes of the reductions that should be expanded in-loop, along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1779   ReductionChainMap InLoopReductionChains;
1780 
1781   /// A Map of inloop reduction operations and their immediate chain operand.
1782   /// FIXME: This can be removed once reductions can be costed correctly in
1783   /// vplan. This was added to allow quick lookup to the inloop operations,
1784   /// without having to loop through InLoopReductionChains.
1785   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1786 
1787   /// Returns the expected difference in cost from scalarizing the expression
1788   /// feeding a predicated instruction \p PredInst. The instructions to
1789   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1790   /// non-negative return value implies the expression will be scalarized.
1791   /// Currently, only single-use chains are considered for scalarization.
1792   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1793                               ElementCount VF);
1794 
1795   /// Collect the instructions that are uniform after vectorization. An
1796   /// instruction is uniform if we represent it with a single scalar value in
1797   /// the vectorized loop corresponding to each vector iteration. Examples of
1798   /// uniform instructions include pointer operands of consecutive or
1799   /// interleaved memory accesses. Note that although uniformity implies an
1800   /// instruction will be scalar, the reverse is not true. In general, a
1801   /// scalarized instruction will be represented by VF scalar values in the
1802   /// vectorized loop, each corresponding to an iteration of the original
1803   /// scalar loop.
1804   void collectLoopUniforms(ElementCount VF);
1805 
1806   /// Collect the instructions that are scalar after vectorization. An
1807   /// instruction is scalar if it is known to be uniform or will be scalarized
1808   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1809   /// to the list if they are used by a load/store instruction that is marked as
1810   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1811   /// VF values in the vectorized loop, each corresponding to an iteration of
1812   /// the original scalar loop.
1813   void collectLoopScalars(ElementCount VF);
1814 
1815   /// Keeps cost model vectorization decision and cost for instructions.
1816   /// Right now it is used for memory instructions only.
1817   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1818                                 std::pair<InstWidening, InstructionCost>>;
1819 
1820   DecisionList WideningDecisions;
1821 
1822   /// Returns true if \p V is expected to be vectorized and it needs to be
1823   /// extracted.
1824   bool needsExtract(Value *V, ElementCount VF) const {
1825     Instruction *I = dyn_cast<Instruction>(V);
1826     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1827         TheLoop->isLoopInvariant(I))
1828       return false;
1829 
1830     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1832     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1833     // the scalars are collected. That should be a safe assumption in most
1834     // cases, because we check if the operands have vectorizable types
1835     // beforehand in LoopVectorizationLegality.
1836     return Scalars.find(VF) == Scalars.end() ||
1837            !isScalarAfterVectorization(I, VF);
1838   };
1839 
1840   /// Returns a range containing only operands needing to be extracted.
1841   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1842                                                    ElementCount VF) const {
1843     return SmallVector<Value *, 4>(make_filter_range(
1844         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1845   }
1846 
1847   /// Determines if we have the infrastructure to vectorize loop \p L and its
1848   /// epilogue, assuming the main loop is vectorized by \p VF.
1849   bool isCandidateForEpilogueVectorization(const Loop &L,
1850                                            const ElementCount VF) const;
1851 
1852   /// Returns true if epilogue vectorization is considered profitable, and
1853   /// false otherwise.
1854   /// \p VF is the vectorization factor chosen for the original loop.
1855   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1856 
1857 public:
1858   /// The loop that we evaluate.
1859   Loop *TheLoop;
1860 
1861   /// Predicated scalar evolution analysis.
1862   PredicatedScalarEvolution &PSE;
1863 
1864   /// Loop Info analysis.
1865   LoopInfo *LI;
1866 
1867   /// Vectorization legality.
1868   LoopVectorizationLegality *Legal;
1869 
1870   /// Vector target information.
1871   const TargetTransformInfo &TTI;
1872 
1873   /// Target Library Info.
1874   const TargetLibraryInfo *TLI;
1875 
1876   /// Demanded bits analysis.
1877   DemandedBits *DB;
1878 
1879   /// Assumption cache.
1880   AssumptionCache *AC;
1881 
1882   /// Interface to emit optimization remarks.
1883   OptimizationRemarkEmitter *ORE;
1884 
1885   const Function *TheFunction;
1886 
1887   /// Loop Vectorize Hint.
1888   const LoopVectorizeHints *Hints;
1889 
  /// The interleaved access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
1892   InterleavedAccessInfo &InterleaveInfo;
1893 
1894   /// Values to ignore in the cost model.
1895   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1896 
1897   /// Values to ignore in the cost model when VF > 1.
1898   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1899 
1900   /// All element types found in the loop.
1901   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1902 
1903   /// Profitable vector factors.
1904   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1905 };
1906 } // end namespace llvm
1907 
1908 /// Helper struct to manage generating runtime checks for vectorization.
1909 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow a better estimate of their cost. After deciding to
/// vectorize, the checks are moved back into the IR. If we decide not to
/// vectorize, the temporary blocks are removed completely.
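///
/// Typical lifetime (a sketch based on the interface below): construct the
/// object, call Create() up-front so the cost of the checks can be estimated,
/// then either call emitSCEVChecks()/emitMemRuntimeChecks() when vectorizing,
/// or let the destructor remove the unused temporary blocks.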
1914 class GeneratedRTChecks {
1915   /// Basic block which contains the generated SCEV checks, if any.
1916   BasicBlock *SCEVCheckBlock = nullptr;
1917 
1918   /// The value representing the result of the generated SCEV checks. If it is
1919   /// nullptr, either no SCEV checks have been generated or they have been used.
1920   Value *SCEVCheckCond = nullptr;
1921 
1922   /// Basic block which contains the generated memory runtime checks, if any.
1923   BasicBlock *MemCheckBlock = nullptr;
1924 
1925   /// The value representing the result of the generated memory runtime checks.
1926   /// If it is nullptr, either no memory runtime checks have been generated or
1927   /// they have been used.
1928   Value *MemRuntimeCheckCond = nullptr;
1929 
1930   DominatorTree *DT;
1931   LoopInfo *LI;
1932 
1933   SCEVExpander SCEVExp;
1934   SCEVExpander MemCheckExp;
1935 
1936 public:
1937   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1938                     const DataLayout &DL)
1939       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1940         MemCheckExp(SE, DL, "scev.check") {}
1941 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
1947   void Create(Loop *L, const LoopAccessInfo &LAI,
1948               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1949 
1950     BasicBlock *LoopHeader = L->getHeader();
1951     BasicBlock *Preheader = L->getLoopPreheader();
1952 
1953     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1954     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1955     // may be used by SCEVExpander. The blocks will be un-linked from their
1956     // predecessors and removed from LI & DT at the end of the function.
1957     if (!UnionPred.isAlwaysTrue()) {
1958       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1959                                   nullptr, "vector.scevcheck");
1960 
1961       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1962           &UnionPred, SCEVCheckBlock->getTerminator());
1963     }
1964 
1965     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1966     if (RtPtrChecking.Need) {
1967       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1968       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1969                                  "vector.memcheck");
1970 
1971       auto DiffChecks = RtPtrChecking.getDiffChecks();
1972       if (DiffChecks) {
1973         MemRuntimeCheckCond = addDiffRuntimeChecks(
1974             MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
1975             [VF](IRBuilderBase &B, unsigned Bits) {
1976               return getRuntimeVF(B, B.getIntNTy(Bits), VF);
1977             },
1978             IC);
1979       } else {
1980         MemRuntimeCheckCond =
1981             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1982                              RtPtrChecking.getChecks(), MemCheckExp);
1983       }
1984       assert(MemRuntimeCheckCond &&
1985              "no RT checks generated although RtPtrChecking "
1986              "claimed checks are required");
1987     }
1988 
1989     if (!MemCheckBlock && !SCEVCheckBlock)
1990       return;
1991 
    // Unhook the temporary blocks containing the checks and update various
    // places accordingly.
1994     if (SCEVCheckBlock)
1995       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1996     if (MemCheckBlock)
1997       MemCheckBlock->replaceAllUsesWith(Preheader);
1998 
1999     if (SCEVCheckBlock) {
2000       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2001       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2002       Preheader->getTerminator()->eraseFromParent();
2003     }
2004     if (MemCheckBlock) {
2005       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2006       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2007       Preheader->getTerminator()->eraseFromParent();
2008     }
2009 
2010     DT->changeImmediateDominator(LoopHeader, Preheader);
2011     if (MemCheckBlock) {
2012       DT->eraseNode(MemCheckBlock);
2013       LI->removeBlock(MemCheckBlock);
2014     }
2015     if (SCEVCheckBlock) {
2016       DT->eraseNode(SCEVCheckBlock);
2017       LI->removeBlock(SCEVCheckBlock);
2018     }
2019   }
2020 
2021   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2022   /// unused.
2023   ~GeneratedRTChecks() {
2024     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2025     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2026     if (!SCEVCheckCond)
2027       SCEVCleaner.markResultUsed();
2028 
2029     if (!MemRuntimeCheckCond)
2030       MemCheckCleaner.markResultUsed();
2031 
2032     if (MemRuntimeCheckCond) {
2033       auto &SE = *MemCheckExp.getSE();
2034       // Memory runtime check generation creates compares that use expanded
2035       // values. Remove them before running the SCEVExpanderCleaners.
2036       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2037         if (MemCheckExp.isInsertedInstruction(&I))
2038           continue;
2039         SE.forgetValue(&I);
2040         I.eraseFromParent();
2041       }
2042     }
2043     MemCheckCleaner.cleanup();
2044     SCEVCleaner.cleanup();
2045 
2046     if (SCEVCheckCond)
2047       SCEVCheckBlock->eraseFromParent();
2048     if (MemRuntimeCheckCond)
2049       MemCheckBlock->eraseFromParent();
2050   }
2051 
2052   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2053   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2054   /// depending on the generated condition.
2055   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2056                              BasicBlock *LoopVectorPreHeader,
2057                              BasicBlock *LoopExitBlock) {
2058     if (!SCEVCheckCond)
2059       return nullptr;
2060 
2061     Value *Cond = SCEVCheckCond;
2062     // Mark the check as used, to prevent it from being removed during cleanup.
2063     SCEVCheckCond = nullptr;
2064     if (auto *C = dyn_cast<ConstantInt>(Cond))
2065       if (C->isZero())
2066         return nullptr;
2067 
2068     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069 
2070     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2071     // Create new preheader for vector loop.
2072     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2073       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2074 
2075     SCEVCheckBlock->getTerminator()->eraseFromParent();
2076     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078                                                 SCEVCheckBlock);
2079 
2080     DT->addNewBlock(SCEVCheckBlock, Pred);
2081     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2082 
2083     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2084                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2085     return SCEVCheckBlock;
2086   }
2087 
2088   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2089   /// the branches to branch to the vector preheader or \p Bypass, depending on
2090   /// the generated condition.
2091   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2092                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays overlap.
2094     if (!MemRuntimeCheckCond)
2095       return nullptr;
2096 
2097     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2098     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2099                                                 MemCheckBlock);
2100 
2101     DT->addNewBlock(MemCheckBlock, Pred);
2102     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2103     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2104 
2105     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2106       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2107 
2108     ReplaceInstWithInst(
2109         MemCheckBlock->getTerminator(),
2110         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2111     MemCheckBlock->getTerminator()->setDebugLoc(
2112         Pred->getTerminator()->getDebugLoc());
2113 
2114     // Mark the check as used, to prevent it from being removed during cleanup.
2115     MemRuntimeCheckCond = nullptr;
2116     return MemCheckBlock;
2117   }
2118 };
2119 
2120 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2123 // vector length information is not provided, vectorization is not considered
2124 // explicit. Interleave hints are not allowed either. These limitations will be
2125 // relaxed in the future.
// Please note that we are currently forced to abuse the semantics of the
// 'clang loop vectorize' pragma. This pragma provides *auto-vectorization hints*
2128 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2129 // provides *explicit vectorization hints* (LV can bypass legal checks and
2130 // assume that vectorization is legal). However, both hints are implemented
2131 // using the same metadata (llvm.loop.vectorize, processed by
2132 // LoopVectorizeHints). This will be fixed in the future when the native IR
2133 // representation for pragma 'omp simd' is introduced.
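//
// For example (illustrative), an outer loop annotated for explicit
// vectorization with a vector length of 4:
//
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)     // outer loop, handled by the VPlan path
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];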
2134 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2135                                    OptimizationRemarkEmitter *ORE) {
2136   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2137   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2138 
2139   // Only outer loops with an explicit vectorization hint are supported.
2140   // Unannotated outer loops are ignored.
2141   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2142     return false;
2143 
2144   Function *Fn = OuterLp->getHeader()->getParent();
2145   if (!Hints.allowVectorization(Fn, OuterLp,
2146                                 true /*VectorizeOnlyWhenForced*/)) {
2147     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2148     return false;
2149   }
2150 
2151   if (Hints.getInterleave() > 1) {
2152     // TODO: Interleave support is future work.
2153     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2154                          "outer loops.\n");
2155     Hints.emitRemarkWithHints();
2156     return false;
2157   }
2158 
2159   return true;
2160 }
2161 
2162 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2163                                   OptimizationRemarkEmitter *ORE,
2164                                   SmallVectorImpl<Loop *> &V) {
2165   // Collect inner loops and outer loops without irreducible control flow. For
2166   // now, only collect outer loops that have explicit vectorization hints. If we
2167   // are stress testing the VPlan H-CFG construction, we collect the outermost
2168   // loop of every loop nest.
2169   if (L.isInnermost() || VPlanBuildStressTest ||
2170       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2171     LoopBlocksRPO RPOT(&L);
2172     RPOT.perform(LI);
2173     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2174       V.push_back(&L);
2175       // TODO: Collect inner loops inside marked outer loops in case
2176       // vectorization fails for the outer loop. Do not invoke
2177       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2178       // already known to be reducible. We can use an inherited attribute for
2179       // that.
2180       return;
2181     }
2182   }
2183   for (Loop *InnerL : L)
2184     collectSupportedLoops(*InnerL, LI, ORE, V);
2185 }
2186 
2187 namespace {
2188 
2189 /// The LoopVectorize Pass.
2190 struct LoopVectorize : public FunctionPass {
2191   /// Pass identification, replacement for typeid
2192   static char ID;
2193 
2194   LoopVectorizePass Impl;
2195 
2196   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2197                          bool VectorizeOnlyWhenForced = false)
2198       : FunctionPass(ID),
2199         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2200     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2201   }
2202 
2203   bool runOnFunction(Function &F) override {
2204     if (skipFunction(F))
2205       return false;
2206 
2207     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2208     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2209     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2210     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2211     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2212     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2213     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2214     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2215     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2216     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2217     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2218     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2219     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2220 
2221     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2222         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2223 
2224     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2225                         GetLAA, *ORE, PSI).MadeAnyChange;
2226   }
2227 
2228   void getAnalysisUsage(AnalysisUsage &AU) const override {
2229     AU.addRequired<AssumptionCacheTracker>();
2230     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2231     AU.addRequired<DominatorTreeWrapperPass>();
2232     AU.addRequired<LoopInfoWrapperPass>();
2233     AU.addRequired<ScalarEvolutionWrapperPass>();
2234     AU.addRequired<TargetTransformInfoWrapperPass>();
2235     AU.addRequired<AAResultsWrapperPass>();
2236     AU.addRequired<LoopAccessLegacyAnalysis>();
2237     AU.addRequired<DemandedBitsWrapperPass>();
2238     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2239     AU.addRequired<InjectTLIMappingsLegacy>();
2240 
2241     // We currently do not preserve loopinfo/dominator analyses with outer loop
2242     // vectorization. Until this is addressed, mark these analyses as preserved
2243     // only for non-VPlan-native path.
2244     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2245     if (!EnableVPlanNativePath) {
2246       AU.addPreserved<LoopInfoWrapperPass>();
2247       AU.addPreserved<DominatorTreeWrapperPass>();
2248     }
2249 
2250     AU.addPreserved<BasicAAWrapperPass>();
2251     AU.addPreserved<GlobalsAAWrapperPass>();
2252     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2253   }
2254 };
2255 
2256 } // end anonymous namespace
2257 
2258 //===----------------------------------------------------------------------===//
2259 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2260 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2261 //===----------------------------------------------------------------------===//
2262 
2263 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
2267   Instruction *Instr = dyn_cast<Instruction>(V);
2268   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2269                      (!Instr ||
2270                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2271   // Place the code for broadcasting invariant variables in the new preheader.
2272   IRBuilder<>::InsertPointGuard Guard(Builder);
2273   if (SafeToHoist)
2274     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2275 
2276   // Broadcast the scalar into all locations in the vector.
2277   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2278 
2279   return Shuf;
2280 }
2281 
2282 /// This function adds
2283 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at \p StartIdx.
/// \p BinOp is only relevant for FP induction variables.
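/// For example (illustrative): with Val = <10, 10, 10, 10>, StartIdx = 0 and
/// Step = 2 on the integer path, the result is <10, 12, 14, 16>.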
2286 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2287                             Instruction::BinaryOps BinOp, ElementCount VF,
2288                             IRBuilderBase &Builder) {
2289   assert(VF.isVector() && "only vector VFs are supported");
2290 
2291   // Create and check the types.
2292   auto *ValVTy = cast<VectorType>(Val->getType());
2293   ElementCount VLen = ValVTy->getElementCount();
2294 
2295   Type *STy = Val->getType()->getScalarType();
2296   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2297          "Induction Step must be an integer or FP");
2298   assert(Step->getType() == STy && "Step has wrong type");
2299 
2300   SmallVector<Constant *, 8> Indices;
2301 
  // Create a vector of consecutive numbers from zero to VF - 1.
2303   VectorType *InitVecValVTy = ValVTy;
2304   if (STy->isFloatingPointTy()) {
2305     Type *InitVecValSTy =
2306         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2307     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2308   }
2309   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2310 
2311   // Splat the StartIdx
2312   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2313 
2314   if (STy->isIntegerTy()) {
2315     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2316     Step = Builder.CreateVectorSplat(VLen, Step);
2317     assert(Step->getType() == Val->getType() && "Invalid step vec");
2318     // FIXME: The newly created binary instructions should contain nsw/nuw
2319     // flags, which can be found from the original scalar operations.
2320     Step = Builder.CreateMul(InitVec, Step);
2321     return Builder.CreateAdd(Val, Step, "induction");
2322   }
2323 
2324   // Floating point induction.
2325   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2326          "Binary Opcode should be specified for FP induction");
2327   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2328   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2329 
2330   Step = Builder.CreateVectorSplat(VLen, Step);
2331   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2332   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2333 }
2334 
2335 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2336 /// variable on which to base the steps, \p Step is the size of the step.
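/// For illustration (assuming createStepForVF yields Part * VF for a fixed
/// VF): with a fixed VF of 4 and an integer induction variable, lane L of
/// unroll part P receives ScalarIV + (P * 4 + L) * Step.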
2337 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2338                              const InductionDescriptor &ID, VPValue *Def,
2339                              VPTransformState &State) {
2340   IRBuilderBase &Builder = State.Builder;
2341   // We shouldn't have to build scalar steps if we aren't vectorizing.
2342   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2344   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2345   assert(ScalarIVTy == Step->getType() &&
2346          "Val and Step should have the same type");
2347 
2348   // We build scalar steps for both integer and floating-point induction
2349   // variables. Here, we determine the kind of arithmetic we will perform.
2350   Instruction::BinaryOps AddOp;
2351   Instruction::BinaryOps MulOp;
2352   if (ScalarIVTy->isIntegerTy()) {
2353     AddOp = Instruction::Add;
2354     MulOp = Instruction::Mul;
2355   } else {
2356     AddOp = ID.getInductionOpcode();
2357     MulOp = Instruction::FMul;
2358   }
2359 
2360   // Determine the number of scalars we need to generate for each unroll
2361   // iteration.
2362   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2363   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2364   // Compute the scalar steps and save the results in State.
2365   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2366                                      ScalarIVTy->getScalarSizeInBits());
2367   Type *VecIVTy = nullptr;
2368   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2369   if (!FirstLaneOnly && State.VF.isScalable()) {
2370     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2371     UnitStepVec =
2372         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2373     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2374     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2375   }
2376 
2377   for (unsigned Part = 0; Part < State.UF; ++Part) {
2378     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2379 
2380     if (!FirstLaneOnly && State.VF.isScalable()) {
2381       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2382       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2383       if (ScalarIVTy->isFloatingPointTy())
2384         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2385       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2386       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2387       State.set(Def, Add, Part);
      // It's also useful to record the per-lane values for the known minimum
      // number of elements, so we do that below as well. This improves code
      // quality when, for example, extracting the first element.
2391     }
2392 
2393     if (ScalarIVTy->isFloatingPointTy())
2394       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2395 
2396     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2397       Value *StartIdx = Builder.CreateBinOp(
2398           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2399       // The step returned by `createStepForVF` is a runtime-evaluated value
2400       // when VF is scalable. Otherwise, it should be folded into a Constant.
2401       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2402              "Expected StartIdx to be folded to a constant when VF is not "
2403              "scalable");
2404       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2405       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2406       State.set(Def, Add, VPIteration(Part, Lane));
2407     }
2408   }
2409 }
2410 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
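// For example (illustrative only): if the step is a SCEVUnknown wrapping a
// loop-invariant value %stride, that value is returned directly; for a
// composite expression such as (4 * %stride), SCEVExpander emits code that
// computes it in front of InsertBefore.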
2413 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2414                               Instruction *InsertBefore,
2415                               Loop *OrigLoop = nullptr) {
2416   const DataLayout &DL = SE.getDataLayout();
2417   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2418          "Induction step should be loop invariant");
2419   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2420     return E->getValue();
2421 
2422   SCEVExpander Exp(SE, DL, "induction");
2423   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2424 }
2425 
2426 /// Compute the transformed value of Index at offset StartValue using step
2427 /// StepValue.
2428 /// For integer induction, returns StartValue + Index * StepValue.
2429 /// For pointer induction, returns StartValue[Index * StepValue].
2430 /// FIXME: The newly created binary instructions should contain nsw/nuw
2431 /// flags, which can be found from the original scalar operations.
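/// For example (an illustrative sketch): for an integer induction with
/// StartValue %start, Step 4 and Index %i this produces %start + %i * 4,
/// while a pointer induction produces a GEP of %start by (%i * Step) over the
/// induction's element type.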
2432 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2433                                    Value *StartValue, Value *Step,
2434                                    const InductionDescriptor &ID) {
2435   assert(Index->getType()->getScalarType() == Step->getType() &&
2436          "Index scalar type does not match StepValue type");
2437 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2444   auto CreateAdd = [&B](Value *X, Value *Y) {
2445     assert(X->getType() == Y->getType() && "Types don't match!");
2446     if (auto *CX = dyn_cast<ConstantInt>(X))
2447       if (CX->isZero())
2448         return Y;
2449     if (auto *CY = dyn_cast<ConstantInt>(Y))
2450       if (CY->isZero())
2451         return X;
2452     return B.CreateAdd(X, Y);
2453   };
2454 
2455   // We allow X to be a vector type, in which case Y will potentially be
2456   // splatted into a vector with the same element count.
2457   auto CreateMul = [&B](Value *X, Value *Y) {
2458     assert(X->getType()->getScalarType() == Y->getType() &&
2459            "Types don't match!");
2460     if (auto *CX = dyn_cast<ConstantInt>(X))
2461       if (CX->isOne())
2462         return Y;
2463     if (auto *CY = dyn_cast<ConstantInt>(Y))
2464       if (CY->isOne())
2465         return X;
2466     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2467     if (XVTy && !isa<VectorType>(Y->getType()))
2468       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2469     return B.CreateMul(X, Y);
2470   };
2471 
2472   switch (ID.getKind()) {
2473   case InductionDescriptor::IK_IntInduction: {
2474     assert(!isa<VectorType>(Index->getType()) &&
2475            "Vector indices not supported for integer inductions yet");
2476     assert(Index->getType() == StartValue->getType() &&
2477            "Index type does not match StartValue type");
2478     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2479       return B.CreateSub(StartValue, Index);
2480     auto *Offset = CreateMul(Index, Step);
2481     return CreateAdd(StartValue, Offset);
2482   }
2483   case InductionDescriptor::IK_PtrInduction: {
2484     assert(isa<Constant>(Step) &&
2485            "Expected constant step for pointer induction");
2486     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2487   }
2488   case InductionDescriptor::IK_FpInduction: {
2489     assert(!isa<VectorType>(Index->getType()) &&
2490            "Vector indices not supported for FP inductions yet");
2491     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2492     auto InductionBinOp = ID.getInductionBinOp();
2493     assert(InductionBinOp &&
2494            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2495             InductionBinOp->getOpcode() == Instruction::FSub) &&
2496            "Original bin op should be defined for FP induction");
2497 
2498     Value *MulExp = B.CreateFMul(Step, Index);
2499     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2500                          "induction");
2501   }
2502   case InductionDescriptor::IK_NoInduction:
2503     return nullptr;
2504   }
2505   llvm_unreachable("invalid enum");
2506 }
2507 
2508 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2509                                                     const VPIteration &Instance,
2510                                                     VPTransformState &State) {
2511   Value *ScalarInst = State.get(Def, Instance);
2512   Value *VectorValue = State.get(Def, Instance.Part);
2513   VectorValue = Builder.CreateInsertElement(
2514       VectorValue, ScalarInst,
2515       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2516   State.set(Def, VectorValue, Instance.Part);
2517 }
2518 
2519 // Return whether we allow using masked interleave-groups (for dealing with
2520 // strided loads/stores that reside in predicated blocks, or for dealing
2521 // with gaps).
2522 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2523   // If an override option has been passed in for interleaved accesses, use it.
2524   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2525     return EnableMaskedInterleavedMemAccesses;
2526 
2527   return TTI.enableMaskedInterleavedAccessVectorization();
2528 }
2529 
2530 // Try to vectorize the interleave group that \p Instr belongs to.
2531 //
// E.g. Translate the following interleaved load group (factor = 3):
2533 //   for (i = 0; i < N; i+=3) {
2534 //     R = Pic[i];             // Member of index 0
2535 //     G = Pic[i+1];           // Member of index 1
2536 //     B = Pic[i+2];           // Member of index 2
2537 //     ... // do something to R, G, B
2538 //   }
2539 // To:
2540 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2541 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2542 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2543 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2544 //
// Or translate the following interleaved store group (factor = 3):
2546 //   for (i = 0; i < N; i+=3) {
2547 //     ... do something to R, G, B
2548 //     Pic[i]   = R;           // Member of index 0
2549 //     Pic[i+1] = G;           // Member of index 1
2550 //     Pic[i+2] = B;           // Member of index 2
2551 //   }
2552 // To:
2553 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2554 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2555 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2556 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2557 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2558 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2559     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2560     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2561     VPValue *BlockInMask) {
2562   Instruction *Instr = Group->getInsertPos();
2563   const DataLayout &DL = Instr->getModule()->getDataLayout();
2564 
2565   // Prepare for the vector type of the interleaved load/store.
2566   Type *ScalarTy = getLoadStoreType(Instr);
2567   unsigned InterleaveFactor = Group->getFactor();
2568   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2569   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2570 
2571   // Prepare for the new pointers.
2572   SmallVector<Value *, 2> AddrParts;
2573   unsigned Index = Group->getIndex(Instr);
2574 
2575   // TODO: extend the masked interleaved-group support to reversed access.
2576   assert((!BlockInMask || !Group->isReverse()) &&
2577          "Reversed masked interleave-group not supported.");
2578 
  // If the group is reversed, adjust the index to refer to the last vector
  // lane instead of the first. We adjust the index from the first vector lane,
2581   // rather than directly getting the pointer for lane VF - 1, because the
2582   // pointer operand of the interleaved access is supposed to be uniform. For
2583   // uniform instructions, we're only required to generate a value for the
2584   // first vector lane in each unroll iteration.
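  // For example (illustrative numbers only): with VF = 4 and an interleave
  // factor of 2, a reversed group adds (4 - 1) * 2 = 6 to the index, so the
  // negated index applied below makes the pointer refer to the last vector
  // lane rather than the first, as described above.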
2585   if (Group->isReverse())
2586     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2587 
2588   for (unsigned Part = 0; Part < UF; Part++) {
2589     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2590     setDebugLocFromInst(AddrPart);
2591 
    // Note that the current instruction may belong to any index of the group,
    // so the address needs to be adjusted to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2603 
2604     bool InBounds = false;
2605     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2606       InBounds = gep->isInBounds();
2607     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2608     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2609 
2610     // Cast to the vector pointer type.
2611     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2612     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2613     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2614   }
2615 
2616   setDebugLocFromInst(Instr);
2617   Value *PoisonVec = PoisonValue::get(VecTy);
2618 
2619   Value *MaskForGaps = nullptr;
2620   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2621     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2622     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2623   }
2624 
2625   // Vectorize the interleaved load group.
2626   if (isa<LoadInst>(Instr)) {
2627     // For each unroll part, create a wide load for the group.
2628     SmallVector<Value *, 2> NewLoads;
2629     for (unsigned Part = 0; Part < UF; Part++) {
2630       Instruction *NewLoad;
2631       if (BlockInMask || MaskForGaps) {
2632         assert(useMaskedInterleavedAccesses(*TTI) &&
2633                "masked interleaved groups are not allowed.");
2634         Value *GroupMask = MaskForGaps;
2635         if (BlockInMask) {
2636           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2637           Value *ShuffledMask = Builder.CreateShuffleVector(
2638               BlockInMaskPart,
2639               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2640               "interleaved.mask");
2641           GroupMask = MaskForGaps
2642                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2643                                                 MaskForGaps)
2644                           : ShuffledMask;
2645         }
2646         NewLoad =
2647             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2648                                      GroupMask, PoisonVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
2653       Group->addMetadata(NewLoad);
2654       NewLoads.push_back(NewLoad);
2655     }
2656 
2657     // For each member in the group, shuffle out the appropriate data from the
2658     // wide loads.
2659     unsigned J = 0;
2660     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2661       Instruction *Member = Group->getMember(I);
2662 
2663       // Skip the gaps in the group.
2664       if (!Member)
2665         continue;
2666 
2667       auto StrideMask =
2668           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2669       for (unsigned Part = 0; Part < UF; Part++) {
2670         Value *StridedVec = Builder.CreateShuffleVector(
2671             NewLoads[Part], StrideMask, "strided.vec");
2672 
        // If this member has a different type, cast the result to that type.
2674         if (Member->getType() != ScalarTy) {
2675           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2676           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2677           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2678         }
2679 
2680         if (Group->isReverse())
2681           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2682 
2683         State.set(VPDefs[J], StridedVec, Part);
2684       }
2685       ++J;
2686     }
2687     return;
2688   }
2689 
  // The subvector type for the current instruction.
2691   auto *SubVT = VectorType::get(ScalarTy, VF);
2692 
2693   // Vectorize the interleaved store group.
2694   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2695   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2696          "masked interleaved groups are not allowed.");
2697   assert((!MaskForGaps || !VF.isScalable()) &&
2698          "masking gaps for scalable vectors is not yet supported.");
2699   for (unsigned Part = 0; Part < UF; Part++) {
2700     // Collect the stored vector from each member.
2701     SmallVector<Value *, 4> StoredVecs;
2702     for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Failed to get a member from an interleaved store group");
2705       Instruction *Member = Group->getMember(i);
2706 
2707       // Skip the gaps in the group.
2708       if (!Member) {
2709         Value *Undef = PoisonValue::get(SubVT);
2710         StoredVecs.push_back(Undef);
2711         continue;
2712       }
2713 
2714       Value *StoredVec = State.get(StoredValues[i], Part);
2715 
2716       if (Group->isReverse())
2717         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2718 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2722         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2723 
2724       StoredVecs.push_back(StoredVec);
2725     }
2726 
2727     // Concatenate all vectors into a wide vector.
2728     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2729 
2730     // Interleave the elements in the wide vector.
2731     Value *IVec = Builder.CreateShuffleVector(
2732         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2733         "interleaved.vec");
2734 
2735     Instruction *NewStoreInstr;
2736     if (BlockInMask || MaskForGaps) {
2737       Value *GroupMask = MaskForGaps;
2738       if (BlockInMask) {
2739         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2740         Value *ShuffledMask = Builder.CreateShuffleVector(
2741             BlockInMaskPart,
2742             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2743             "interleaved.mask");
2744         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2745                                                       ShuffledMask, MaskForGaps)
2746                                 : ShuffledMask;
2747       }
2748       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2749                                                 Group->getAlign(), GroupMask);
2750     } else
2751       NewStoreInstr =
2752           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2753 
2754     Group->addMetadata(NewStoreInstr);
2755   }
2756 }
2757 
2758 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2759                                                VPReplicateRecipe *RepRecipe,
2760                                                const VPIteration &Instance,
2761                                                bool IfPredicateInstr,
2762                                                VPTransformState &State) {
2763   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2764 
  // llvm.experimental.noalias.scope.decl intrinsics must not be duplicated;
  // only emit them for the first lane and part.
2767   if (isa<NoAliasScopeDeclInst>(Instr))
2768     if (!Instance.isFirstIteration())
2769       return;
2770 
  // Does this instruction return a value?
2772   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2773 
2774   Instruction *Cloned = Instr->clone();
2775   if (!IsVoidRetTy)
2776     Cloned->setName(Instr->getName() + ".cloned");
2777 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2784   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2785     Cloned->dropPoisonGeneratingFlags();
2786 
2787   if (Instr->getDebugLoc())
2788     setDebugLocFromInst(Instr);
2789 
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2792   for (auto &I : enumerate(RepRecipe->operands())) {
2793     auto InputInstance = Instance;
2794     VPValue *Operand = I.value();
2795     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2796     if (OperandR && OperandR->isUniform())
2797       InputInstance.Lane = VPLane::getFirstLane();
2798     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2799   }
2800   addNewMetadata(Cloned, Instr);
2801 
2802   // Place the cloned scalar in the new loop.
2803   State.Builder.Insert(Cloned);
2804 
2805   State.set(RepRecipe, Cloned, Instance);
2806 
  // If we just cloned a new assumption, add it to the assumption cache.
2808   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2809     AC->registerAssumption(II);
2810 
2811   // End if-block.
2812   if (IfPredicateInstr)
2813     PredicatedInstructions.push_back(Cloned);
2814 }
2815 
2816 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2817   if (TripCount)
2818     return TripCount;
2819 
2820   assert(InsertBlock);
2821   IRBuilder<> Builder(InsertBlock->getTerminator());
2822   // Find the loop boundaries.
2823   ScalarEvolution *SE = PSE.getSE();
2824   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2825   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2826          "Invalid loop count");
2827 
2828   Type *IdxTy = Legal->getWidestInductionType();
2829   assert(IdxTy && "No type for induction");
2830 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is
  // if the induction variable was signed and as such will not overflow, so
  // the truncation is legal.
2836   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2837       IdxTy->getPrimitiveSizeInBits())
2838     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2839   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2840 
  // Get the total trip count by adding 1 to the backedge-taken count.
2842   const SCEV *ExitCount = SE->getAddExpr(
2843       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2844 
2845   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2846 
2847   // Expand the trip count and place the new instructions in the preheader.
2848   // Notice that the pre-header does not change, only the loop body.
2849   SCEVExpander Exp(*SE, DL, "induction");
2850 
2851   // Count holds the overall loop count (N).
2852   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2853                                 InsertBlock->getTerminator());
2854 
2855   if (TripCount->getType()->isPointerTy())
2856     TripCount =
2857         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2858                                     InsertBlock->getTerminator());
2859 
2860   return TripCount;
2861 }
2862 
2863 Value *
2864 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2865   if (VectorTripCount)
2866     return VectorTripCount;
2867 
2868   Value *TC = getOrCreateTripCount(InsertBlock);
2869   IRBuilder<> Builder(InsertBlock->getTerminator());
2870 
2871   Type *Ty = TC->getType();
2872   // This is where we can make the step a runtime constant.
2873   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2874 
2875   // If the tail is to be folded by masking, round the number of iterations N
2876   // up to a multiple of Step instead of rounding down. This is done by first
2877   // adding Step-1 and then rounding down. Note that it's ok if this addition
2878   // overflows: the vector induction variable will eventually wrap to zero given
2879   // that it starts at zero and its Step is a power of two; the loop will then
2880   // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck, which adds an overflow check.
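  // For example (illustrative numbers only): with a trip count of 10 and
  // Step = VF * UF = 8, TC is rounded up to 10 + 7 = 17, and the vector trip
  // count computed below becomes 17 - (17 % 8) = 16, so the masked vector
  // loop covers all 10 original iterations.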
2883   if (Cost->foldTailByMasking()) {
2884     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2885            "VF*UF must be a power of 2 when folding tail by masking");
2886     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2887     TC = Builder.CreateAdd(
2888         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2889   }
2890 
  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
2896   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2897 
2898   // There are cases where we *must* run at least one iteration in the remainder
2899   // loop.  See the cost model for when this can happen.  If the step evenly
2900   // divides the trip count, we set the remainder to be equal to the step. If
2901   // the step does not evenly divide the trip count, no adjustment is necessary
2902   // since there will already be scalar iterations. Note that the minimum
2903   // iterations check ensures that N >= Step.
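  // For example (illustrative numbers only): with a trip count of 16 and
  // Step = 8, the remainder would be 0; it is bumped to 8 so that the vector
  // trip count becomes 8 and the scalar epilogue still runs 8 iterations.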
2904   if (Cost->requiresScalarEpilogue(VF)) {
2905     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2906     R = Builder.CreateSelect(IsZero, Step, R);
2907   }
2908 
2909   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2910 
2911   return VectorTripCount;
2912 }
2913 
2914 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2915                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2917   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2918   unsigned VF = DstFVTy->getNumElements();
2919   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2921   Type *SrcElemTy = SrcVecTy->getElementType();
2922   Type *DstElemTy = DstFVTy->getElementType();
2923   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2924          "Vector elements must have same size");
2925 
2926   // Do a direct cast if element types are castable.
2927   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2928     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2929   }
  // V cannot be cast directly to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast using an intermediate
  // integer type for the bitcast, i.e. Ptr <-> Int <-> Float.
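  // For example (an illustrative case): casting <4 x double> to a vector of
  // 64-bit pointers goes via <4 x i64>, i.e. bitcast the doubles to i64 and
  // then cast the integer vector to the pointer vector.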
2934   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2935          "Only one type should be a pointer type");
2936   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2937          "Only one type should be a floating point type");
2938   Type *IntTy =
2939       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2940   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2941   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2942   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2943 }
2944 
2945 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2946   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
2949   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2950   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2951 
2952   // Generate code to check if the loop's trip count is less than VF * UF, or
2953   // equal to it in case a scalar epilogue is required; this implies that the
2954   // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed, leading to an incorrect trip count
2956   // of zero. In this case we will also jump to the scalar loop.
2957   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2958                                             : ICmpInst::ICMP_ULT;
2959 
  // If the tail is to be folded, the vector loop handles all iterations.
2961   Type *CountTy = Count->getType();
2962   Value *CheckMinIters = Builder.getFalse();
2963   Value *Step = createStepForVF(Builder, CountTy, VF, UF);
2964   if (!Cost->foldTailByMasking())
2965     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2966   else if (VF.isScalable()) {
2967     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2968     // an overflow to zero when updating induction variables and so an
2969     // additional overflow check is required before entering the vector loop.
2970 
2971     // Get the maximum unsigned value for the type.
2972     Value *MaxUIntTripCount =
2973         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2974     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2975 
2976     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2977     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
2978   }
2979   // Create new preheader for vector loop.
2980   LoopVectorPreHeader =
2981       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2982                  "vector.ph");
2983 
2984   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2985                                DT->getNode(Bypass)->getIDom()) &&
2986          "TC check is expected to dominate Bypass");
2987 
2988   // Update dominator for Bypass & LoopExit (if needed).
2989   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2990   if (!Cost->requiresScalarEpilogue(VF))
2991     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
2993     // dominator of the exit blocks.
2994     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2995 
2996   ReplaceInstWithInst(
2997       TCCheckBlock->getTerminator(),
2998       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2999   LoopBypassBlocks.push_back(TCCheckBlock);
3000 }
3001 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3004   BasicBlock *const SCEVCheckBlock =
3005       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3006   if (!SCEVCheckBlock)
3007     return nullptr;
3008 
3009   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3010            (OptForSizeBasedOnProfile &&
3011             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Update dominator only if this is the first RT check.
3016   if (LoopBypassBlocks.empty()) {
3017     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3018     if (!Cost->requiresScalarEpilogue(VF))
3019       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3021       // dominator of the exit blocks.
3022       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3023   }
3024 
3025   LoopBypassBlocks.push_back(SCEVCheckBlock);
3026   AddedSafetyChecks = true;
3027   return SCEVCheckBlock;
3028 }
3029 
3030 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3031   // VPlan-native path does not do any analysis for runtime checks currently.
3032   if (EnableVPlanNativePath)
3033     return nullptr;
3034 
3035   BasicBlock *const MemCheckBlock =
3036       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3037 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3041   if (!MemCheckBlock)
3042     return nullptr;
3043 
3044   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3045     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3046            "Cannot emit memory checks when optimizing for size, unless forced "
3047            "to vectorize.");
3048     ORE->emit([&]() {
3049       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3050                                         OrigLoop->getStartLoc(),
3051                                         OrigLoop->getHeader())
3052              << "Code-size may be reduced by not forcing "
3053                 "vectorization, or by source-code modifications "
3054                 "eliminating the need for runtime checks "
3055                 "(e.g., adding 'restrict').";
3056     });
3057   }
3058 
3059   LoopBypassBlocks.push_back(MemCheckBlock);
3060 
3061   AddedSafetyChecks = true;
3062 
3063   // Only use noalias metadata when using memory checks guaranteeing no overlap
3064   // across all iterations.
3065   if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
    // We currently don't use LoopVersioning for the actual loop cloning but
    // we still use it to add the noalias metadata.
3068     LVer = std::make_unique<LoopVersioning>(
3069         *Legal->getLAI(),
3070         Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3071         DT, PSE.getSE());
3072     LVer->prepareNoAliasMetadata();
3073   }
3074   return MemCheckBlock;
3075 }
3076 
3077 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3078   LoopScalarBody = OrigLoop->getHeader();
3079   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3080   assert(LoopVectorPreHeader && "Invalid loop structure");
3081   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3082   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3083          "multiple exit loop without required epilogue?");
3084 
3085   LoopMiddleBlock =
3086       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3087                  LI, nullptr, Twine(Prefix) + "middle.block");
3088   LoopScalarPreHeader =
3089       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3090                  nullptr, Twine(Prefix) + "scalar.ph");
3091 
3092   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3093 
  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
3102   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3103     BranchInst::Create(LoopScalarPreHeader) :
3104     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3105                        Builder.getTrue());
3106   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3107   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3108 
3109   // Update dominator for loop exit. During skeleton creation, only the vector
3110   // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
3112   if (!Cost->requiresScalarEpilogue(VF))
3113     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3115     // dominator of the exit blocks.
3116     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3117 }
3118 
3119 void InnerLoopVectorizer::createInductionResumeValues(
3120     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3121   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3122           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3123          "Inconsistent information about additional bypass.");
3124 
3125   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3126   assert(VectorTripCount && "Expected valid arguments");
3127   // We are going to resume the execution of the scalar loop.
3128   // Go over all of the induction variables that we found and fix the
3129   // PHIs that are left in the scalar version of the loop.
3130   // The starting values of PHI nodes depend on the counter of the last
3131   // iteration in the vectorized loop.
3132   // If we come from a bypass edge then we need to start from the original
3133   // start value.
3134   Instruction *OldInduction = Legal->getPrimaryInduction();
3135   for (auto &InductionEntry : Legal->getInductionVars()) {
3136     PHINode *OrigPhi = InductionEntry.first;
3137     InductionDescriptor II = InductionEntry.second;
3138 
    // Create phi nodes to merge from the backedge-taken check block.
3140     PHINode *BCResumeVal =
3141         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3142                         LoopScalarPreHeader->getTerminator());
3143     // Copy original phi DL over to the new one.
3144     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3145     Value *&EndValue = IVEndValues[OrigPhi];
3146     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3147     if (OrigPhi == OldInduction) {
3148       // We know what the end value is.
3149       EndValue = VectorTripCount;
3150     } else {
3151       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3152 
3153       // Fast-math-flags propagate from the original induction instruction.
3154       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3155         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3156 
3157       Type *StepType = II.getStep()->getType();
3158       Instruction::CastOps CastOp =
3159           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3160       Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3161       Value *Step =
3162           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3163       EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3164       EndValue->setName("ind.end");
3165 
3166       // Compute the end value for the additional bypass (if applicable).
3167       if (AdditionalBypass.first) {
3168         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3169         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3170                                          StepType, true);
3171         Value *Step =
3172             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3173         VTC =
3174             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3175         EndValueFromAdditionalBypass =
3176             emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3177         EndValueFromAdditionalBypass->setName("ind.end");
3178       }
3179     }
3180     // The new PHI merges the original incoming value, in case of a bypass,
3181     // or the value at the end of the vectorized loop.
3182     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3183 
3184     // Fix the scalar body counter (PHI node).
3185     // The old induction's phi node in the scalar body needs the truncated
3186     // value.
3187     for (BasicBlock *BB : LoopBypassBlocks)
3188       BCResumeVal->addIncoming(II.getStartValue(), BB);
3189 
3190     if (AdditionalBypass.first)
3191       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3192                                             EndValueFromAdditionalBypass);
3193 
3194     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3195   }
3196 }
3197 
3198 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3199   // The trip counts should be cached by now.
3200   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3201   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3202 
3203   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3204 
3205   // Add a check in the middle block to see if we have completed
3206   // all of the iterations in the first vector loop.  Three cases:
3207   // 1) If we require a scalar epilogue, there is no conditional branch as
3208   //    we unconditionally branch to the scalar preheader.  Do nothing.
3209   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3210   //    Thus if tail is to be folded, we know we don't need to run the
3211   //    remainder and we can use the previous value for the condition (true).
3212   // 3) Otherwise, construct a runtime check.
3213   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3214     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3215                                         Count, VectorTripCount, "cmp.n",
3216                                         LoopMiddleBlock->getTerminator());
3217 
3218     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3219     // of the corresponding compare because they may have ended up with
3220     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3222     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3223     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3224   }
3225 
3226 #ifdef EXPENSIVE_CHECKS
3227   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3228 #endif
3229 
3230   return LoopVectorPreHeader;
3231 }
3232 
3233 std::pair<BasicBlock *, Value *>
3234 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3235   /*
3236    In this function we generate a new loop. The new loop will contain
3237    the vectorized instructions while the old loop will continue to run the
3238    scalar remainder.
3239 
3240        [ ] <-- loop iteration number check.
3241     /   |
3242    /    v
3243   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3244   |  /  |
3245   | /   v
3246   ||   [ ]     <-- vector pre header.
3247   |/    |
3248   |     v
3249   |    [  ] \
3250   |    [  ]_|   <-- vector loop (created during VPlan execution).
3251   |     |
3252   |     v
3253   \   -[ ]   <--- middle-block.
3254    \/   |
3255    /\   v
3256    | ->[ ]     <--- new preheader.
3257    |    |
3258  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3259    |   [ ] \
3260    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3261     \   |
3262      \  v
3263       >[ ]     <-- exit block(s).
3264    ...
3265    */
3266 
3267   // Get the metadata of the original loop before it gets modified.
3268   MDNode *OrigLoopID = OrigLoop->getLoopID();
3269 
3270   // Workaround!  Compute the trip count of the original loop and cache it
3271   // before we start modifying the CFG.  This code has a systemic problem
3272   // wherein it tries to run analysis over partially constructed IR; this is
3273   // wrong, and not simply for SCEV.  The trip count of the original loop
3274   // simply happens to be prone to hitting this in practice.  In theory, we
3275   // can hit the same issue for any SCEV, or ValueTracking query done during
3276   // mutation.  See PR49900.
3277   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3278 
3279   // Create an empty vector loop, and prepare basic blocks for the runtime
3280   // checks.
3281   createVectorLoopSkeleton("");
3282 
3283   // Now, compare the new count to zero. If it is zero skip the vector loop and
3284   // jump to the scalar loop. This check also covers the case where the
3285   // backedge-taken count is uint##_max: adding one to it will overflow leading
3286   // to an incorrect trip count of zero. In this (rare) case we will also jump
3287   // to the scalar loop.
3288   emitIterationCountCheck(LoopScalarPreHeader);
3289 
3290   // Generate the code to check any assumptions that we've made for SCEV
3291   // expressions.
3292   emitSCEVChecks(LoopScalarPreHeader);
3293 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3297   emitMemRuntimeChecks(LoopScalarPreHeader);
3298 
3299   // Emit phis for the new starting index of the scalar loop.
3300   createInductionResumeValues();
3301 
3302   return {completeLoopSkeleton(OrigLoopID), nullptr};
3303 }
3304 
3305 // Fix up external users of the induction variable. At this point, we are
3306 // in LCSSA form, with all external PHIs that use the IV having one input value,
3307 // coming from the remainder loop. We need those PHIs to also have a correct
3308 // value for the IV when arriving directly from the middle block.
3309 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3310                                        const InductionDescriptor &II,
3311                                        Value *VectorTripCount, Value *EndValue,
3312                                        BasicBlock *MiddleBlock,
3313                                        BasicBlock *VectorHeader, VPlan &Plan) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the value feeding into the phi from the
  // loop latch) and those that use the penultimate value (the PHI itself). We
  // allow both, but they, obviously, have different values.
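  // For example (a sketch in IR terms): given
  //   %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // an LCSSA phi using %iv.next receives EndValue, while one using %iv
  // receives EndValue - Step, recomputed below as
  // Start + Step * (VectorTripCount - 1).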
3318 
3319   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3320 
3321   DenseMap<Value *, Value *> MissingVals;
3322 
3323   // An external user of the last iteration's value should see the value that
3324   // the remainder loop uses to initialize its own IV.
3325   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3326   for (User *U : PostInc->users()) {
3327     Instruction *UI = cast<Instruction>(U);
3328     if (!OrigLoop->contains(UI)) {
3329       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3330       MissingVals[UI] = EndValue;
3331     }
3332   }
3333 
  // An external user of the penultimate value needs to see EndValue - Step.
3335   // The simplest way to get this is to recompute it from the constituent SCEVs,
3336   // that is Start + (Step * (CRD - 1)).
3337   for (User *U : OrigPhi->users()) {
3338     auto *UI = cast<Instruction>(U);
3339     if (!OrigLoop->contains(UI)) {
3340       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3341 
3342       IRBuilder<> B(MiddleBlock->getTerminator());
3343 
3344       // Fast-math-flags propagate from the original induction instruction.
3345       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3346         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3347 
3348       Value *CountMinusOne = B.CreateSub(
3349           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3350       Value *CMO =
3351           !II.getStep()->getType()->isIntegerTy()
3352               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3353                              II.getStep()->getType())
3354               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3355       CMO->setName("cast.cmo");
3356 
3357       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3358                                     VectorHeader->getTerminator());
3359       Value *Escape =
3360           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3361       Escape->setName("ind.escape");
3362       MissingVals[UI] = Escape;
3363     }
3364   }
3365 
3366   for (auto &I : MissingVals) {
3367     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3369     // that is %IV2 = phi [...], [ %IV1, %latch ]
3370     // In this case, if IV1 has an external use, we need to avoid adding both
3371     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3372     // don't already have an incoming value for the middle block.
3373     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3374       PHI->addIncoming(I.second, MiddleBlock);
3375       Plan.removeLiveOut(PHI);
3376     }
3377   }
3378 }
3379 
3380 namespace {
3381 
3382 struct CSEDenseMapInfo {
3383   static bool canHandle(const Instruction *I) {
3384     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3385            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3386   }
3387 
3388   static inline Instruction *getEmptyKey() {
3389     return DenseMapInfo<Instruction *>::getEmptyKey();
3390   }
3391 
3392   static inline Instruction *getTombstoneKey() {
3393     return DenseMapInfo<Instruction *>::getTombstoneKey();
3394   }
3395 
3396   static unsigned getHashValue(const Instruction *I) {
3397     assert(canHandle(I) && "Unknown instruction!");
3398     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3399                                                            I->value_op_end()));
3400   }
3401 
3402   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3403     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3404         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3405       return LHS == RHS;
3406     return LHS->isIdenticalTo(RHS);
3407   }
3408 };
3409 
3410 } // end anonymous namespace
3411 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3415   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3416   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3417     if (!CSEDenseMapInfo::canHandle(&In))
3418       continue;
3419 
3420     // Check if we can replace this instruction with any of the
3421     // visited instructions.
3422     if (Instruction *V = CSEMap.lookup(&In)) {
3423       In.replaceAllUsesWith(V);
3424       In.eraseFromParent();
3425       continue;
3426     }
3427 
3428     CSEMap[&In] = &In;
3429   }
3430 }
3431 
3432 InstructionCost
3433 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3434                                               bool &NeedToScalarize) const {
3435   Function *F = CI->getCalledFunction();
3436   Type *ScalarRetTy = CI->getType();
3437   SmallVector<Type *, 4> Tys, ScalarTys;
3438   for (auto &ArgOp : CI->args())
3439     ScalarTys.push_back(ArgOp->getType());
3440 
  // Estimate the cost of the scalarized vector call. The source operands are
  // assumed to be vectors, so we need to extract individual elements from
  // them, execute VF scalar calls, and then gather the results into the
  // vector return value.
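  // In other words (a sketch of the comparison done below): the scalarized
  // cost is VF * ScalarCallCost + ScalarizationCost, which is then compared
  // against the cost of calling a vector library function, if one exists.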
3445   InstructionCost ScalarCallCost =
3446       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3447   if (VF.isScalar())
3448     return ScalarCallCost;
3449 
3450   // Compute corresponding vector type for return value and arguments.
3451   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3452   for (Type *ScalarTy : ScalarTys)
3453     Tys.push_back(ToVectorTy(ScalarTy, VF));
3454 
3455   // Compute costs of unpacking argument values for the scalar calls and
3456   // packing the return values to a vector.
3457   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3458 
3459   InstructionCost Cost =
3460       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3461 
3462   // If we can't emit a vector call for this function, then the currently found
3463   // cost is the cost we need to return.
3464   NeedToScalarize = true;
3465   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3466   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3467 
3468   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3469     return Cost;
3470 
3471   // If the corresponding vector cost is cheaper, return its cost.
3472   InstructionCost VectorCallCost =
3473       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3474   if (VectorCallCost < Cost) {
3475     NeedToScalarize = false;
3476     Cost = VectorCallCost;
3477   }
3478   return Cost;
3479 }
3480 
3481 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3482   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3483     return Elt;
3484   return VectorType::get(Elt, VF);
3485 }
3486 
3487 InstructionCost
3488 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3489                                                    ElementCount VF) const {
3490   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3491   assert(ID && "Expected intrinsic call!");
3492   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3493   FastMathFlags FMF;
3494   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3495     FMF = FPMO->getFastMathFlags();
3496 
3497   SmallVector<const Value *> Arguments(CI->args());
3498   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3499   SmallVector<Type *> ParamTys;
3500   std::transform(FTy->param_begin(), FTy->param_end(),
3501                  std::back_inserter(ParamTys),
3502                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3503 
3504   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3505                                     dyn_cast<IntrinsicInst>(CI));
3506   return TTI.getIntrinsicInstrCost(CostAttrs,
3507                                    TargetTransformInfo::TCK_RecipThroughput);
3508 }
3509 
3510 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3511   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3512   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3513   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3514 }
3515 
3516 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3517   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3518   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3519   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3520 }
3521 
3522 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I`, and re-extend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
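  // For example (an illustrative sketch): if MinBWs records that an i32 add
  // only needs 8 bits, a vectorized
  //   %a = add <4 x i32> %x, %y
  // is rewritten as
  //   %t = add <4 x i8> (trunc %x), (trunc %y)
  //   %a = zext <4 x i8> %t to <4 x i32>
  // leaving InstCombine to remove any redundant trunc/zext pairs.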
3526   SmallPtrSet<Value *, 4> Erased;
3527   for (const auto &KV : Cost->getMinimalBitwidths()) {
3528     // If the value wasn't vectorized, we must maintain the original scalar
3529     // type. The absence of the value from State indicates that it
3530     // wasn't vectorized.
3531     // FIXME: Should not rely on getVPValue at this point.
3532     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3533     if (!State.hasAnyVectorValue(Def))
3534       continue;
3535     for (unsigned Part = 0; Part < UF; ++Part) {
3536       Value *I = State.get(Def, Part);
3537       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3538         continue;
3539       Type *OriginalTy = I->getType();
3540       Type *ScalarTruncatedTy =
3541           IntegerType::get(OriginalTy->getContext(), KV.second);
3542       auto *TruncatedTy = VectorType::get(
3543           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3544       if (TruncatedTy == OriginalTy)
3545         continue;
3546 
3547       IRBuilder<> B(cast<Instruction>(I));
3548       auto ShrinkOperand = [&](Value *V) -> Value * {
3549         if (auto *ZI = dyn_cast<ZExtInst>(V))
3550           if (ZI->getSrcTy() == TruncatedTy)
3551             return ZI->getOperand(0);
3552         return B.CreateZExtOrTrunc(V, TruncatedTy);
3553       };
3554 
3555       // The actual instruction modification depends on the instruction type,
3556       // unfortunately.
3557       Value *NewI = nullptr;
3558       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3559         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3560                              ShrinkOperand(BO->getOperand(1)));
3561 
3562         // Any wrapping introduced by shrinking this operation shouldn't be
3563         // considered undefined behavior. So, we can't unconditionally copy
3564         // arithmetic wrapping flags to NewI.
3565         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3566       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3567         NewI =
3568             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3569                          ShrinkOperand(CI->getOperand(1)));
3570       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3571         NewI = B.CreateSelect(SI->getCondition(),
3572                               ShrinkOperand(SI->getTrueValue()),
3573                               ShrinkOperand(SI->getFalseValue()));
3574       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3575         switch (CI->getOpcode()) {
3576         default:
3577           llvm_unreachable("Unhandled cast!");
3578         case Instruction::Trunc:
3579           NewI = ShrinkOperand(CI->getOperand(0));
3580           break;
3581         case Instruction::SExt:
3582           NewI = B.CreateSExtOrTrunc(
3583               CI->getOperand(0),
3584               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3585           break;
3586         case Instruction::ZExt:
3587           NewI = B.CreateZExtOrTrunc(
3588               CI->getOperand(0),
3589               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3590           break;
3591         }
3592       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3593         auto Elements0 =
3594             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3595         auto *O0 = B.CreateZExtOrTrunc(
3596             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3597         auto Elements1 =
3598             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3599         auto *O1 = B.CreateZExtOrTrunc(
3600             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3601 
3602         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3603       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3604         // Don't do anything with the operands, just extend the result.
3605         continue;
3606       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3607         auto Elements =
3608             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3609         auto *O0 = B.CreateZExtOrTrunc(
3610             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3611         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3612         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3613       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3614         auto Elements =
3615             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3616         auto *O0 = B.CreateZExtOrTrunc(
3617             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3618         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3619       } else {
3620         // If we don't know what to do, be conservative and don't do anything.
3621         continue;
3622       }
3623 
3624       // Lastly, extend the result.
3625       NewI->takeName(cast<Instruction>(I));
3626       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3627       I->replaceAllUsesWith(Res);
3628       cast<Instruction>(I)->eraseFromParent();
3629       Erased.insert(I);
3630       State.reset(Def, Res, Part);
3631     }
3632   }
3633 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3635   for (const auto &KV : Cost->getMinimalBitwidths()) {
3636     // If the value wasn't vectorized, we must maintain the original scalar
3637     // type. The absence of the value from State indicates that it
3638     // wasn't vectorized.
3639     // FIXME: Should not rely on getVPValue at this point.
3640     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3641     if (!State.hasAnyVectorValue(Def))
3642       continue;
3643     for (unsigned Part = 0; Part < UF; ++Part) {
3644       Value *I = State.get(Def, Part);
3645       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3646       if (Inst && Inst->use_empty()) {
3647         Value *NewI = Inst->getOperand(0);
3648         Inst->eraseFromParent();
3649         State.reset(Def, NewI, Part);
3650       }
3651     }
3652   }
3653 }
3654 
3655 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3656                                             VPlan &Plan) {
3657   // Insert truncates and extends for any truncated instructions as hints to
3658   // InstCombine.
3659   if (VF.isVector())
3660     truncateToMinimalBitwidths(State);
3661 
3662   // Fix widened non-induction PHIs by setting up the PHI operands.
3663   if (EnableVPlanNativePath)
3664     fixNonInductionPHIs(Plan, State);
3665 
3666   // At this point every instruction in the original loop is widened to a
3667   // vector form. Now we need to fix the recurrences in the loop. These PHI
3668   // nodes are currently empty because we did not want to introduce cycles.
3669   // This is the second stage of vectorizing recurrences.
3670   fixCrossIterationPHIs(State);
3671 
3672   // Forget the original basic block.
3673   PSE.getSE()->forgetLoop(OrigLoop);
3674 
3675   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3676   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3677   if (Cost->requiresScalarEpilogue(VF)) {
3678     // No edge from the middle block to the unique exit block has been inserted
3679     // and there is nothing to fix from vector loop; phis should have incoming
3680     // from scalar loop only.
3681     Plan.clearLiveOuts();
3682   } else {
3683     // If we inserted an edge from the middle block to the unique exit block,
3684     // update uses outside the loop (phis) to account for the newly inserted
3685     // edge.
3686 
3687     // Fix-up external users of the induction variables.
3688     for (auto &Entry : Legal->getInductionVars())
3689       fixupIVUsers(Entry.first, Entry.second,
3690                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3691                    IVEndValues[Entry.first], LoopMiddleBlock,
3692                    VectorLoop->getHeader(), Plan);
3693   }
3694 
3695   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3696   // in the exit block, so update the builder.
3697   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3698   for (auto &KV : Plan.getLiveOuts())
3699     KV.second->fixPhi(Plan, State);
3700 
3701   for (Instruction *PI : PredicatedInstructions)
3702     sinkScalarOperands(&*PI);
3703 
3704   // Remove redundant induction instructions.
3705   cse(VectorLoop->getHeader());
3706 
3707   // Set/update profile weights for the vector and remainder loops as original
3708   // loop iterations are now distributed among them. Note that original loop
3709   // represented by LoopScalarBody becomes remainder loop after vectorization.
3710   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since
  // profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3716   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3720   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3721                                LI->getLoopFor(LoopScalarBody),
3722                                VF.getKnownMinValue() * UF);
3723 }
3724 
3725 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3726   // In order to support recurrences we need to be able to vectorize Phi nodes.
3727   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3728   // stage #2: We now need to fix the recurrences by adding incoming edges to
3729   // the currently empty PHI nodes. At this point every instruction in the
3730   // original loop is widened to a vector form so we can use them to construct
3731   // the incoming edges.
3732   VPBasicBlock *Header =
3733       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3734   for (VPRecipeBase &R : Header->phis()) {
3735     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3736       fixReduction(ReductionPhi, State);
3737     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3738       fixFirstOrderRecurrence(FOR, State);
3739   }
3740 }
3741 
3742 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3743     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3744   // This is the second phase of vectorizing first-order recurrences. An
3745   // overview of the transformation is described below. Suppose we have the
3746   // following loop.
3747   //
3748   //   for (int i = 0; i < n; ++i)
3749   //     b[i] = a[i] - a[i - 1];
3750   //
3751   // There is a first-order recurrence on "a". For this loop, the shorthand
3752   // scalar IR looks like:
3753   //
3754   //   scalar.ph:
3755   //     s_init = a[-1]
3756   //     br scalar.body
3757   //
3758   //   scalar.body:
3759   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3760   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3761   //     s2 = a[i]
3762   //     b[i] = s2 - s1
3763   //     br cond, scalar.body, ...
3764   //
  // In this example, s1 is a recurrence because its value depends on the
3766   // previous iteration. In the first phase of vectorization, we created a
3767   // vector phi v1 for s1. We now complete the vectorization and produce the
3768   // shorthand vector IR shown below (for VF = 4, UF = 1).
3769   //
3770   //   vector.ph:
3771   //     v_init = vector(..., ..., ..., a[-1])
3772   //     br vector.body
3773   //
3774   //   vector.body
3775   //     i = phi [0, vector.ph], [i+4, vector.body]
3776   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3777   //     v2 = a[i, i+1, i+2, i+3];
3778   //     v3 = vector(v1(3), v2(0, 1, 2))
3779   //     b[i, i+1, i+2, i+3] = v2 - v3
3780   //     br cond, vector.body, middle.block
3781   //
3782   //   middle.block:
3783   //     x = v2(3)
3784   //     br scalar.ph
3785   //
3786   //   scalar.ph:
3787   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3788   //     br scalar.body
3789   //
3790   // After execution completes the vector loop, we extract the next value of
3791   // the recurrence (x) to use as the initial value in the scalar loop.
3792 
3793   // Extract the last vector element in the middle block. This will be the
3794   // initial value for the recurrence when jumping to the scalar loop.
3795   VPValue *PreviousDef = PhiR->getBackedgeValue();
3796   Value *Incoming = State.get(PreviousDef, UF - 1);
3797   auto *ExtractForScalar = Incoming;
3798   auto *IdxTy = Builder.getInt32Ty();
3799   if (VF.isVector()) {
3800     auto *One = ConstantInt::get(IdxTy, 1);
3801     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3802     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3803     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3804     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3805                                                     "vector.recur.extract");
3806   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
3809   // and not the last element (the phi update in the current iteration). This
3810   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3811   // when the scalar loop is not run at all.
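  // Continuing the shorthand example above (VF = 4, UF = 1), this roughly
  // extracts v2(2), i.e.:
  //   vector.recur.extract.for.phi = extractelement v2, (VF - 2)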
3812   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3813   if (VF.isVector()) {
3814     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3815     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3816     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3817         Incoming, Idx, "vector.recur.extract.for.phi");
3818   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
3823     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3824 
3825   // Fix the initial value of the original recurrence in the scalar loop.
3826   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3827   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3828   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3829   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3830   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3831     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3832     Start->addIncoming(Incoming, BB);
3833   }
3834 
3835   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3836   Phi->setName("scalar.recur");
3837 
3838   // Finally, fix users of the recurrence outside the loop. The users will need
3839   // either the last value of the scalar recurrence or the last value of the
3840   // vector recurrence we extracted in the middle block. Since the loop is in
3841   // LCSSA form, we just need to find all the phi nodes for the original scalar
3842   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
3847   if (!Cost->requiresScalarEpilogue(VF))
3848     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3849       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3850         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3851         State.Plan->removeLiveOut(&LCSSAPhi);
3852       }
3853 }
3854 
3855 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3856                                        VPTransformState &State) {
3857   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3859   assert(Legal->isReductionVariable(OrigPhi) &&
3860          "Unable to find the reduction variable");
3861   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3862 
3863   RecurKind RK = RdxDesc.getRecurrenceKind();
3864   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3865   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3866   setDebugLocFromInst(ReductionStartValue);
3867 
3868   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3869   // This is the vector-clone of the value that leaves the loop.
3870   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3871 
3872   // Wrap flags are in general invalid after vectorization, clear them.
3873   clearReductionWrapFlags(PhiR, State);
3874 
3875   // Before each round, move the insertion point right between
3876   // the PHIs and the values we are going to write.
3877   // This allows us to write both PHINodes and the extractelement
3878   // instructions.
3879   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3880 
3881   setDebugLocFromInst(LoopExitInst);
3882 
3883   Type *PhiTy = OrigPhi->getType();
3884 
3885   VPBasicBlock *LatchVPBB =
3886       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3887   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3888   // If tail is folded by masking, the vector value to leave the loop should be
3889   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3890   // instead of the former. For an inloop reduction the reduction will already
3891   // be predicated, and does not need to be handled here.
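  // In shorthand, a tail-folded add reduction in the vector body looks
  // something like:
  //   %rdx.next = add <4 x i32> %rdx.phi, %val
  //   %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  // and it is %sel, not %rdx.next, that must feed the final reduction below
  // (and, for some targets, the phi on the backedge).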
3892   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3893     for (unsigned Part = 0; Part < UF; ++Part) {
3894       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3895       SelectInst *Sel = nullptr;
3896       for (User *U : VecLoopExitInst->users()) {
3897         if (isa<SelectInst>(U)) {
3898           assert(!Sel && "Reduction exit feeding two selects");
3899           Sel = cast<SelectInst>(U);
3900         } else
3901           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3902       }
3903       assert(Sel && "Reduction exit feeds no select");
3904       State.reset(LoopExitInstDef, Sel, Part);
3905 
3906       if (isa<FPMathOperator>(Sel))
3907         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3908 
3909       // If the target can create a predicated operator for the reduction at no
3910       // extra cost in the loop (for example a predicated vadd), it can be
3911       // cheaper for the select to remain in the loop than be sunk out of it,
3912       // and so use the select value for the phi instead of the old
3913       // LoopExitValue.
3914       if (PreferPredicatedReductionSelect ||
3915           TTI->preferPredicatedReductionSelect(
3916               RdxDesc.getOpcode(), PhiTy,
3917               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3920         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3921       }
3922     }
3923   }
3924 
3925   // If the vector reduction can be performed in a smaller type, we truncate
3926   // then extend the loop exit value to enable InstCombine to evaluate the
3927   // entire expression in the smaller type.
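  // E.g., for an i32 add reduction known to need only 8 bits (VF = 4), the
  // exit value is rewritten roughly as:
  //   %trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %extnd = zext <4 x i8> %trunc to <4 x i32>  ; or sext if signed
  // so the rest of the chain can later be narrowed by InstCombine.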
3928   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3929     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3930     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3931     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3932     VectorParts RdxParts(UF);
3933     for (unsigned Part = 0; Part < UF; ++Part) {
3934       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3935       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3936       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3937                                         : Builder.CreateZExt(Trunc, VecTy);
3938       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3939         if (U != Trunc) {
3940           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3941           RdxParts[Part] = Extnd;
3942         }
3943     }
3944     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3945     for (unsigned Part = 0; Part < UF; ++Part) {
3946       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3947       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3948     }
3949   }
3950 
3951   // Reduce all of the unrolled parts into a single vector.
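  // E.g., with UF = 2 for an integer add reduction, this emits roughly:
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // before the across-lanes reduction is created further below.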
3952   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3953   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3954 
3955   // The middle block terminator has already been assigned a DebugLoc here (the
3956   // OrigLoop's single latch terminator). We want the whole middle block to
3957   // appear to execute on this line because: (a) it is all compiler generated,
3958   // (b) these instructions are always executed after evaluating the latch
3959   // conditional branch, and (c) other passes may add new predecessors which
3960   // terminate on this line. This is the easiest way to ensure we don't
3961   // accidentally cause an extra step back into the loop while debugging.
3962   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3963   if (PhiR->isOrdered())
3964     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3965   else {
3966     // Floating-point operations should have some FMF to enable the reduction.
3967     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3968     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3969     for (unsigned Part = 1; Part < UF; ++Part) {
3970       Value *RdxPart = State.get(LoopExitInstDef, Part);
3971       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3972         ReducedPartRdx = Builder.CreateBinOp(
3973             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3974       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3975         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3976                                            ReducedPartRdx, RdxPart);
3977       else
3978         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3979     }
3980   }
3981 
3982   // Create the reduction after the loop. Note that inloop reductions create the
3983   // target reduction in the loop using a Reduction recipe.
3984   if (VF.isVector() && !PhiR->isInLoop()) {
3985     ReducedPartRdx =
3986         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3987     // If the reduction can be performed in a smaller type, we need to extend
3988     // the reduction to the wider type before we branch to the original loop.
3989     if (PhiTy != RdxDesc.getRecurrenceType())
3990       ReducedPartRdx = RdxDesc.isSigned()
3991                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3992                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3993   }
3994 
3995   PHINode *ResumePhi =
3996       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3997 
3998   // Create a phi node that merges control-flow from the backedge-taken check
3999   // block and the middle block.
4000   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4001                                         LoopScalarPreHeader->getTerminator());
4002 
4003   // If we are fixing reductions in the epilogue loop then we should already
4004   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4005   // we carry over the incoming values correctly.
4006   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4007     if (Incoming == LoopMiddleBlock)
4008       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4009     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4010       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4011                               Incoming);
4012     else
4013       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4014   }
4015 
  // Set the resume value for this reduction.
4017   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4018 
4019   // If there were stores of the reduction value to a uniform memory address
4020   // inside the loop, create the final store here.
4021   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4022     StoreInst *NewSI =
4023         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4024     propagateMetadata(NewSI, SI);
4025 
4026     // If the reduction value is used in other places,
4027     // then let the code below create PHI's for that.
4028   }
4029 
4030   // Now, we need to fix the users of the reduction variable
4031   // inside and outside of the scalar remainder loop.
4032 
4033   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4034   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4036   if (!Cost->requiresScalarEpilogue(VF))
4037     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4038       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4039         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4040         State.Plan->removeLiveOut(&LCSSAPhi);
4041       }
4042 
4043   // Fix the scalar loop reduction variable with the incoming reduction sum
4044   // from the vector body and from the backedge value.
4045   int IncomingEdgeBlockIdx =
4046       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4047   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4048   // Pick the other block.
4049   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4050   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4051   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4052 }
4053 
4054 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4055                                                   VPTransformState &State) {
4056   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4057   RecurKind RK = RdxDesc.getRecurrenceKind();
4058   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4059     return;
4060 
4061   SmallVector<VPValue *, 8> Worklist;
4062   SmallPtrSet<VPValue *, 8> Visited;
4063   Worklist.push_back(PhiR);
4064   Visited.insert(PhiR);
4065 
4066   while (!Worklist.empty()) {
4067     VPValue *Cur = Worklist.pop_back_val();
4068     for (unsigned Part = 0; Part < UF; ++Part) {
4069       Value *V = State.get(Cur, Part);
4070       if (!isa<OverflowingBinaryOperator>(V))
4071         break;
4072       cast<Instruction>(V)->dropPoisonGeneratingFlags();
4073       }
4074 
4075       for (VPUser *U : Cur->users()) {
4076         auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4077         if (!UserRecipe)
4078           continue;
4079         for (VPValue *V : UserRecipe->definedValues())
4080           if (Visited.insert(V).second)
4081             Worklist.push_back(V);
4082       }
4083   }
4084 }
4085 
4086 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4087   // The basic block and loop containing the predicated instruction.
4088   auto *PredBB = PredInst->getParent();
4089   auto *VectorLoop = LI->getLoopFor(PredBB);
4090 
4091   // Initialize a worklist with the operands of the predicated instruction.
4092   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4093 
4094   // Holds instructions that we need to analyze again. An instruction may be
4095   // reanalyzed if we don't yet know if we can sink it or not.
4096   SmallVector<Instruction *, 8> InstsToReanalyze;
4097 
4098   // Returns true if a given use occurs in the predicated block. Phi nodes use
4099   // their operands in their corresponding predecessor blocks.
4100   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4101     auto *I = cast<Instruction>(U.getUser());
4102     BasicBlock *BB = I->getParent();
4103     if (auto *Phi = dyn_cast<PHINode>(I))
4104       BB = Phi->getIncomingBlock(
4105           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4106     return BB == PredBB;
4107   };
4108 
4109   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
4112   // through the worklist doesn't sink a single instruction.
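  // E.g., an address computation (getelementptr/bitcast) whose only use is the
  // predicated load or store can be moved into the predicated block; its own
  // operands then become new sinking candidates on the next pass.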
4113   bool Changed;
4114   do {
4115     // Add the instructions that need to be reanalyzed to the worklist, and
4116     // reset the changed indicator.
4117     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4118     InstsToReanalyze.clear();
4119     Changed = false;
4120 
4121     while (!Worklist.empty()) {
4122       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4123 
4124       // We can't sink an instruction if it is a phi node, is not in the loop,
4125       // or may have side effects.
4126       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4127           I->mayHaveSideEffects())
4128         continue;
4129 
4130       // If the instruction is already in PredBB, check if we can sink its
4131       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4132       // sinking the scalar instruction I, hence it appears in PredBB; but it
4133       // may have failed to sink I's operands (recursively), which we try
4134       // (again) here.
4135       if (I->getParent() == PredBB) {
4136         Worklist.insert(I->op_begin(), I->op_end());
4137         continue;
4138       }
4139 
4140       // It's legal to sink the instruction if all its uses occur in the
4141       // predicated block. Otherwise, there's nothing to do yet, and we may
4142       // need to reanalyze the instruction.
4143       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4144         InstsToReanalyze.push_back(I);
4145         continue;
4146       }
4147 
4148       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4150       I->moveBefore(&*PredBB->getFirstInsertionPt());
4151       Worklist.insert(I->op_begin(), I->op_end());
4152 
4153       // The sinking may have enabled other instructions to be sunk, so we will
4154       // need to iterate.
4155       Changed = true;
4156     }
4157   } while (Changed);
4158 }
4159 
4160 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4161                                               VPTransformState &State) {
4162   auto Iter = depth_first(
4163       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4164   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4165     for (VPRecipeBase &P : VPBB->phis()) {
4166       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4167       if (!VPPhi)
4168         continue;
4169       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4170       // Make sure the builder has a valid insert point.
4171       Builder.SetInsertPoint(NewPhi);
4172       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4173         VPValue *Inc = VPPhi->getIncomingValue(i);
4174         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4175         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4176       }
4177     }
4178   }
4179 }
4180 
4181 bool InnerLoopVectorizer::useOrderedReductions(
4182     const RecurrenceDescriptor &RdxDesc) {
4183   return Cost->useOrderedReductions(RdxDesc);
4184 }
4185 
4186 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4187                                               VPWidenPHIRecipe *PhiR,
4188                                               VPTransformState &State) {
4189   assert(EnableVPlanNativePath &&
4190          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
4191   // Currently we enter here in the VPlan-native path for non-induction
4192   // PHIs where all control flow is uniform. We simply widen these PHIs.
4193   // Create a vector phi with no operands - the vector phi operands will be
4194   // set at the end of vector code generation.
4195   Type *VecTy = (State.VF.isScalar())
4196                     ? PN->getType()
4197                     : VectorType::get(PN->getType(), State.VF);
4198   Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4199   State.set(PhiR, VecPhi, 0);
4200 }
4201 
4202 /// A helper function for checking whether an integer division-related
4203 /// instruction may divide by zero (in which case it must be predicated if
4204 /// executed conditionally in the scalar code).
4205 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
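/// For example, `udiv i32 %x, 7` never needs predication for this reason,
/// while `udiv i32 %x, %y` (or a literal zero divisor) conservatively does.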
4209 static bool mayDivideByZero(Instruction &I) {
4210   assert((I.getOpcode() == Instruction::UDiv ||
4211           I.getOpcode() == Instruction::SDiv ||
4212           I.getOpcode() == Instruction::URem ||
4213           I.getOpcode() == Instruction::SRem) &&
4214          "Unexpected instruction");
4215   Value *Divisor = I.getOperand(1);
4216   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4217   return !CInt || CInt->isZero();
4218 }
4219 
4220 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4221                                                VPUser &ArgOperands,
4222                                                VPTransformState &State) {
4223   assert(!isa<DbgInfoIntrinsic>(I) &&
4224          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4225   setDebugLocFromInst(&I);
4226 
4227   Module *M = I.getParent()->getParent()->getParent();
4228   auto *CI = cast<CallInst>(&I);
4229 
4230   SmallVector<Type *, 4> Tys;
4231   for (Value *ArgOperand : CI->args())
4232     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4233 
4234   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4235 
  // The flag shows whether we use an intrinsic or a usual call for the
  // vectorized version of the instruction, i.e. whether it is beneficial to
  // perform the intrinsic call rather than the library call.
4239   bool NeedToScalarize = false;
4240   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4241   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4242   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4243   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4244          "Instruction should be scalarized elsewhere.");
4245   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4246          "Either the intrinsic cost or vector call cost must be valid");
4247 
4248   for (unsigned Part = 0; Part < UF; ++Part) {
4249     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4250     SmallVector<Value *, 4> Args;
4251     for (auto &I : enumerate(ArgOperands.operands())) {
4252       // Some intrinsics have a scalar argument - don't replace it with a
4253       // vector.
4254       Value *Arg;
4255       if (!UseVectorIntrinsic ||
4256           !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
4257         Arg = State.get(I.value(), Part);
4258       else
4259         Arg = State.get(I.value(), VPIteration(0, 0));
4260       if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
4261         TysForDecl.push_back(Arg->getType());
4262       Args.push_back(Arg);
4263     }
4264 
4265     Function *VectorF;
4266     if (UseVectorIntrinsic) {
4267       // Use vector version of the intrinsic.
4268       if (VF.isVector())
4269         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4270       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4271       assert(VectorF && "Can't retrieve vector intrinsic.");
4272     } else {
4273       // Use vector version of the function call.
4274       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4275 #ifndef NDEBUG
4276       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4277              "Can't create vector function.");
4278 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4290   }
4291 }
4292 
4293 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4294   // We should not collect Scalars more than once per VF. Right now, this
4295   // function is called from collectUniformsAndScalars(), which already does
4296   // this check. Collecting Scalars for VF=1 does not make any sense.
4297   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4298          "This function should not be visited twice for the same VF");
4299 
4300   // This avoids any chances of creating a REPLICATE recipe during planning
4301   // since that would result in generation of scalarized code during execution,
4302   // which is not supported for scalable vectors.
4303   if (VF.isScalable()) {
4304     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4305     return;
4306   }
4307 
4308   SmallSetVector<Instruction *, 8> Worklist;
4309 
4310   // These sets are used to seed the analysis with pointers used by memory
4311   // accesses that will remain scalar.
4312   SmallSetVector<Instruction *, 8> ScalarPtrs;
4313   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4314   auto *Latch = TheLoop->getLoopLatch();
4315 
4316   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4317   // The pointer operands of loads and stores will be scalar as long as the
4318   // memory access is not a gather or scatter operation. The value operand of a
4319   // store will remain scalar if the store is scalarized.
4320   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4321     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4322     assert(WideningDecision != CM_Unknown &&
4323            "Widening decision should be ready at this moment");
4324     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4325       if (Ptr == Store->getValueOperand())
4326         return WideningDecision == CM_Scalarize;
4327     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4328            "Ptr is neither a value or pointer operand");
4329     return WideningDecision != CM_GatherScatter;
4330   };
4331 
4332   // A helper that returns true if the given value is a bitcast or
4333   // getelementptr instruction contained in the loop.
4334   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4335     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4336             isa<GetElementPtrInst>(V)) &&
4337            !TheLoop->isLoopInvariant(V);
4338   };
4339 
4340   // A helper that evaluates a memory access's use of a pointer. If the use will
4341   // be a scalar use and the pointer is only used by memory accesses, we place
4342   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4343   // PossibleNonScalarPtrs.
4344   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4345     // We only care about bitcast and getelementptr instructions contained in
4346     // the loop.
4347     if (!isLoopVaryingBitCastOrGEP(Ptr))
4348       return;
4349 
4350     // If the pointer has already been identified as scalar (e.g., if it was
4351     // also identified as uniform), there's nothing to do.
4352     auto *I = cast<Instruction>(Ptr);
4353     if (Worklist.count(I))
4354       return;
4355 
4356     // If the use of the pointer will be a scalar use, and all users of the
4357     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4358     // place the pointer in PossibleNonScalarPtrs.
4359     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4360           return isa<LoadInst>(U) || isa<StoreInst>(U);
4361         }))
4362       ScalarPtrs.insert(I);
4363     else
4364       PossibleNonScalarPtrs.insert(I);
4365   };
4366 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4371   //
4372   // (1) Add to the worklist all instructions that have been identified as
4373   // uniform-after-vectorization.
4374   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4375 
4376   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4377   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4379   // scatter operation. The value operand of a store will remain scalar if the
4380   // store is scalarized.
4381   for (auto *BB : TheLoop->blocks())
4382     for (auto &I : *BB) {
4383       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4384         evaluatePtrUse(Load, Load->getPointerOperand());
4385       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4386         evaluatePtrUse(Store, Store->getPointerOperand());
4387         evaluatePtrUse(Store, Store->getValueOperand());
4388       }
4389     }
4390   for (auto *I : ScalarPtrs)
4391     if (!PossibleNonScalarPtrs.count(I)) {
4392       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4393       Worklist.insert(I);
4394     }
4395 
4396   // Insert the forced scalars.
4397   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4398   // induction variable when the PHI user is scalarized.
4399   auto ForcedScalar = ForcedScalars.find(VF);
4400   if (ForcedScalar != ForcedScalars.end())
4401     for (auto *I : ForcedScalar->second)
4402       Worklist.insert(I);
4403 
4404   // Expand the worklist by looking through any bitcasts and getelementptr
4405   // instructions we've already identified as scalar. This is similar to the
4406   // expansion step in collectLoopUniforms(); however, here we're only
4407   // expanding to include additional bitcasts and getelementptr instructions.
4408   unsigned Idx = 0;
4409   while (Idx != Worklist.size()) {
4410     Instruction *Dst = Worklist[Idx++];
4411     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4412       continue;
4413     auto *Src = cast<Instruction>(Dst->getOperand(0));
4414     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4415           auto *J = cast<Instruction>(U);
4416           return !TheLoop->contains(J) || Worklist.count(J) ||
4417                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4418                   isScalarUse(J, Src));
4419         })) {
4420       Worklist.insert(Src);
4421       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4422     }
4423   }
4424 
4425   // An induction variable will remain scalar if all users of the induction
4426   // variable and induction variable update remain scalar.
4427   for (auto &Induction : Legal->getInductionVars()) {
4428     auto *Ind = Induction.first;
4429     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4430 
4431     // If tail-folding is applied, the primary induction variable will be used
4432     // to feed a vector compare.
4433     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4434       continue;
4435 
4436     // Returns true if \p Indvar is a pointer induction that is used directly by
4437     // load/store instruction \p I.
4438     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4439                                               Instruction *I) {
4440       return Induction.second.getKind() ==
4441                  InductionDescriptor::IK_PtrInduction &&
4442              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4443              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4444     };
4445 
4446     // Determine if all users of the induction variable are scalar after
4447     // vectorization.
4448     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4449       auto *I = cast<Instruction>(U);
4450       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4451              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4452     });
4453     if (!ScalarInd)
4454       continue;
4455 
4456     // Determine if all users of the induction variable update instruction are
4457     // scalar after vectorization.
4458     auto ScalarIndUpdate =
4459         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4460           auto *I = cast<Instruction>(U);
4461           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4462                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4463         });
4464     if (!ScalarIndUpdate)
4465       continue;
4466 
4467     // The induction variable and its update instruction will remain scalar.
4468     Worklist.insert(Ind);
4469     Worklist.insert(IndUpdate);
4470     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4471     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4472                       << "\n");
4473   }
4474 
4475   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4476 }
4477 
4478 bool LoopVectorizationCostModel::isScalarWithPredication(
4479     Instruction *I, ElementCount VF) const {
4480   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4481     return false;
  switch (I->getOpcode()) {
4483   default:
4484     break;
4485   case Instruction::Load:
4486   case Instruction::Store: {
4487     if (!Legal->isMaskRequired(I))
4488       return false;
4489     auto *Ptr = getLoadStorePointerOperand(I);
4490     auto *Ty = getLoadStoreType(I);
4491     Type *VTy = Ty;
4492     if (VF.isVector())
4493       VTy = VectorType::get(Ty, VF);
4494     const Align Alignment = getLoadStoreAlignment(I);
4495     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4496                                 TTI.isLegalMaskedGather(VTy, Alignment))
4497                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4498                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4499   }
4500   case Instruction::UDiv:
4501   case Instruction::SDiv:
4502   case Instruction::SRem:
4503   case Instruction::URem:
4504     return mayDivideByZero(*I);
4505   }
4506   return false;
4507 }
4508 
4509 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4510     Instruction *I, ElementCount VF) {
4511   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4512   assert(getWideningDecision(I, VF) == CM_Unknown &&
4513          "Decision should not be set yet.");
4514   auto *Group = getInterleavedAccessGroup(I);
4515   assert(Group && "Must have a group.");
4516 
  // If the instruction's allocated size doesn't equal its type size, it
4518   // requires padding and will be scalarized.
4519   auto &DL = I->getModule()->getDataLayout();
4520   auto *ScalarTy = getLoadStoreType(I);
4521   if (hasIrregularType(ScalarTy, DL))
4522     return false;
4523 
4524   // If the group involves a non-integral pointer, we may not be able to
4525   // losslessly cast all values to a common type.
4526   unsigned InterleaveFactor = Group->getFactor();
4527   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4528   for (unsigned i = 0; i < InterleaveFactor; i++) {
4529     Instruction *Member = Group->getMember(i);
4530     if (!Member)
4531       continue;
4532     auto *MemberTy = getLoadStoreType(Member);
4533     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4534     // Don't coerce non-integral pointers to integers or vice versa.
4535     if (MemberNI != ScalarNI) {
4536       // TODO: Consider adding special nullptr value case here
4537       return false;
4538     } else if (MemberNI && ScalarNI &&
4539                ScalarTy->getPointerAddressSpace() !=
4540                MemberTy->getPointerAddressSpace()) {
4541       return false;
4542     }
4543   }
4544 
4545   // Check if masking is required.
4546   // A Group may need masking for one of two reasons: it resides in a block that
4547   // needs predication, or it was decided to use masking to deal with gaps
4548   // (either a gap at the end of a load-access that may result in a speculative
4549   // load, or any gaps in a store-access).
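  // E.g., a load-only group with factor 4 but only 3 members has a gap at the
  // end; without masking (or a scalar epilogue), the widened load could
  // speculatively read past the last element accessed by the original loop.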
4550   bool PredicatedAccessRequiresMasking =
4551       blockNeedsPredicationForAnyReason(I->getParent()) &&
4552       Legal->isMaskRequired(I);
4553   bool LoadAccessWithGapsRequiresEpilogMasking =
4554       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4555       !isScalarEpilogueAllowed();
4556   bool StoreAccessWithGapsRequiresMasking =
4557       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4558   if (!PredicatedAccessRequiresMasking &&
4559       !LoadAccessWithGapsRequiresEpilogMasking &&
4560       !StoreAccessWithGapsRequiresMasking)
4561     return true;
4562 
4563   // If masked interleaving is required, we expect that the user/target had
4564   // enabled it, because otherwise it either wouldn't have been created or
4565   // it should have been invalidated by the CostModel.
4566   assert(useMaskedInterleavedAccesses(TTI) &&
4567          "Masked interleave-groups for predicated accesses are not enabled.");
4568 
4569   if (Group->isReverse())
4570     return false;
4571 
4572   auto *Ty = getLoadStoreType(I);
4573   const Align Alignment = getLoadStoreAlignment(I);
4574   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4575                           : TTI.isLegalMaskedStore(Ty, Alignment);
4576 }
4577 
4578 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4579     Instruction *I, ElementCount VF) {
4580   // Get and ensure we have a valid memory instruction.
4581   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4582 
4583   auto *Ptr = getLoadStorePointerOperand(I);
4584   auto *ScalarTy = getLoadStoreType(I);
4585 
4586   // In order to be widened, the pointer should be consecutive, first of all.
4587   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4588     return false;
4589 
4590   // If the instruction is a store located in a predicated block, it will be
4591   // scalarized.
4592   if (isScalarWithPredication(I, VF))
4593     return false;
4594 
  // If the instruction's allocated size doesn't equal its type size, it
4596   // requires padding and will be scalarized.
4597   auto &DL = I->getModule()->getDataLayout();
4598   if (hasIrregularType(ScalarTy, DL))
4599     return false;
4600 
4601   return true;
4602 }
4603 
4604 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4605   // We should not collect Uniforms more than once per VF. Right now,
4606   // this function is called from collectUniformsAndScalars(), which
4607   // already does this check. Collecting Uniforms for VF=1 does not make any
4608   // sense.
4609 
4610   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4611          "This function should not be visited twice for the same VF");
4612 
  // Visit the list of Uniforms. Even if we don't find any uniform value, we
  // won't analyze this VF again: Uniforms.count(VF) will return 1 afterwards.
4615   Uniforms[VF].clear();
4616 
4617   // We now know that the loop is vectorizable!
4618   // Collect instructions inside the loop that will remain uniform after
4619   // vectorization.
4620 
4621   // Global values, params and instructions outside of current loop are out of
4622   // scope.
4623   auto isOutOfScope = [&](Value *V) -> bool {
4624     Instruction *I = dyn_cast<Instruction>(V);
4625     return (!I || !TheLoop->contains(I));
4626   };
4627 
4628   // Worklist containing uniform instructions demanding lane 0.
4629   SetVector<Instruction *> Worklist;
4630   BasicBlock *Latch = TheLoop->getLoopLatch();
4631 
4632   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4633   // that are scalar with predication must not be considered uniform after
4634   // vectorization, because that would create an erroneous replicating region
4635   // where only a single instance out of VF should be formed.
4636   // TODO: optimize such seldom cases if found important, see PR40816.
4637   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4638     if (isOutOfScope(I)) {
4639       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4640                         << *I << "\n");
4641       return;
4642     }
4643     if (isScalarWithPredication(I, VF)) {
4644       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4645                         << *I << "\n");
4646       return;
4647     }
4648     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4649     Worklist.insert(I);
4650   };
4651 
4652   // Start with the conditional branch. If the branch condition is an
4653   // instruction contained in the loop that is only used by the branch, it is
4654   // uniform.
4655   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4656   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4657     addToWorklistIfAllowed(Cmp);
4658 
4659   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4660     InstWidening WideningDecision = getWideningDecision(I, VF);
4661     assert(WideningDecision != CM_Unknown &&
4662            "Widening decision should be ready at this moment");
4663 
4664     // A uniform memory op is itself uniform.  We exclude uniform stores
4665     // here as they demand the last lane, not the first one.
4666     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4667       assert(WideningDecision == CM_Scalarize);
4668       return true;
4669     }
4670 
4671     return (WideningDecision == CM_Widen ||
4672             WideningDecision == CM_Widen_Reverse ||
4673             WideningDecision == CM_Interleave);
4674   };
4675 
4676 
4677   // Returns true if Ptr is the pointer operand of a memory access instruction
4678   // I, and I is known to not require scalarization.
4679   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4680     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4681   };
4682 
4683   // Holds a list of values which are known to have at least one uniform use.
4684   // Note that there may be other uses which aren't uniform.  A "uniform use"
4685   // here is something which only demands lane 0 of the unrolled iterations;
4686   // it does not imply that all lanes produce the same value (e.g. this is not
4687   // the usual meaning of uniform)
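  // E.g., a getelementptr feeding only consecutive (widened) loads or stores
  // has a uniform use: only its lane-0 address is demanded, even though the
  // remaining lanes would compute different addresses.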
4688   SetVector<Value *> HasUniformUse;
4689 
4690   // Scan the loop for instructions which are either a) known to have only
4691   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4692   for (auto *BB : TheLoop->blocks())
4693     for (auto &I : *BB) {
4694       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4695         switch (II->getIntrinsicID()) {
4696         case Intrinsic::sideeffect:
4697         case Intrinsic::experimental_noalias_scope_decl:
4698         case Intrinsic::assume:
4699         case Intrinsic::lifetime_start:
4700         case Intrinsic::lifetime_end:
4701           if (TheLoop->hasLoopInvariantOperands(&I))
4702             addToWorklistIfAllowed(&I);
4703           break;
4704         default:
4705           break;
4706         }
4707       }
4708 
4709       // ExtractValue instructions must be uniform, because the operands are
4710       // known to be loop-invariant.
4711       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4712         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4713                "Expected aggregate value to be loop invariant");
4714         addToWorklistIfAllowed(EVI);
4715         continue;
4716       }
4717 
4718       // If there's no pointer operand, there's nothing to do.
4719       auto *Ptr = getLoadStorePointerOperand(&I);
4720       if (!Ptr)
4721         continue;
4722 
4723       // A uniform memory op is itself uniform.  We exclude uniform stores
4724       // here as they demand the last lane, not the first one.
4725       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4726         addToWorklistIfAllowed(&I);
4727 
4728       if (isUniformDecision(&I, VF)) {
4729         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4730         HasUniformUse.insert(Ptr);
4731       }
4732     }
4733 
4734   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4735   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4736   // disallows uses outside the loop as well.
4737   for (auto *V : HasUniformUse) {
4738     if (isOutOfScope(V))
4739       continue;
4740     auto *I = cast<Instruction>(V);
4741     auto UsersAreMemAccesses =
4742       llvm::all_of(I->users(), [&](User *U) -> bool {
4743         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4744       });
4745     if (UsersAreMemAccesses)
4746       addToWorklistIfAllowed(I);
4747   }
4748 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures that
  // a uniform instruction will only be used by uniform instructions.
4752   unsigned idx = 0;
4753   while (idx != Worklist.size()) {
4754     Instruction *I = Worklist[idx++];
4755 
4756     for (auto OV : I->operand_values()) {
4757       // isOutOfScope operands cannot be uniform instructions.
4758       if (isOutOfScope(OV))
4759         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4762       auto *OP = dyn_cast<PHINode>(OV);
4763       if (OP && Legal->isFirstOrderRecurrence(OP))
4764         continue;
4765       // If all the users of the operand are uniform, then add the
4766       // operand into the uniform worklist.
4767       auto *OI = cast<Instruction>(OV);
4768       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4769             auto *J = cast<Instruction>(U);
4770             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4771           }))
4772         addToWorklistIfAllowed(OI);
4773     }
4774   }
4775 
4776   // For an instruction to be added into Worklist above, all its users inside
4777   // the loop should also be in Worklist. However, this condition cannot be
4778   // true for phi nodes that form a cyclic dependence. We must process phi
4779   // nodes separately. An induction variable will remain uniform if all users
4780   // of the induction variable and induction variable update remain uniform.
4781   // The code below handles both pointer and non-pointer induction variables.
4782   for (auto &Induction : Legal->getInductionVars()) {
4783     auto *Ind = Induction.first;
4784     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4785 
4786     // Determine if all users of the induction variable are uniform after
4787     // vectorization.
4788     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4789       auto *I = cast<Instruction>(U);
4790       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4791              isVectorizedMemAccessUse(I, Ind);
4792     });
4793     if (!UniformInd)
4794       continue;
4795 
4796     // Determine if all users of the induction variable update instruction are
4797     // uniform after vectorization.
4798     auto UniformIndUpdate =
4799         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4800           auto *I = cast<Instruction>(U);
4801           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4802                  isVectorizedMemAccessUse(I, IndUpdate);
4803         });
4804     if (!UniformIndUpdate)
4805       continue;
4806 
4807     // The induction variable and its update instruction will remain uniform.
4808     addToWorklistIfAllowed(Ind);
4809     addToWorklistIfAllowed(IndUpdate);
4810   }
4811 
4812   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4813 }
4814 
4815 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4816   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4817 
4818   if (Legal->getRuntimePointerChecking()->Need) {
4819     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4820         "runtime pointer checks needed. Enable vectorization of this "
4821         "loop with '#pragma clang loop vectorize(enable)' when "
4822         "compiling with -Os/-Oz",
4823         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4824     return true;
4825   }
4826 
4827   if (!PSE.getPredicate().isAlwaysTrue()) {
4828     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4829         "runtime SCEV checks needed. Enable vectorization of this "
4830         "loop with '#pragma clang loop vectorize(enable)' when "
4831         "compiling with -Os/-Oz",
4832         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4833     return true;
4834   }
4835 
4836   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4837   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4838     reportVectorizationFailure("Runtime stride check for small trip count",
4839         "runtime stride == 1 checks needed. Enable vectorization of "
4840         "this loop without such check by compiling with -Os/-Oz",
4841         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4842     return true;
4843   }
4844 
4845   return false;
4846 }
4847 
4848 ElementCount
4849 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4850   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4851     return ElementCount::getScalable(0);
4852 
4853   if (Hints->isScalableVectorizationDisabled()) {
4854     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4855                             "ScalableVectorizationDisabled", ORE, TheLoop);
4856     return ElementCount::getScalable(0);
4857   }
4858 
4859   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4860 
4861   auto MaxScalableVF = ElementCount::getScalable(
4862       std::numeric_limits<ElementCount::ScalarTy>::max());
4863 
4864   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4865   // FIXME: While for scalable vectors this is currently sufficient, this should
4866   // be replaced by a more detailed mechanism that filters out specific VFs,
4867   // instead of invalidating vectorization for a whole set of VFs based on the
4868   // MaxVF.
4869 
4870   // Disable scalable vectorization if the loop contains unsupported reductions.
4871   if (!canVectorizeReductions(MaxScalableVF)) {
4872     reportVectorizationInfo(
4873         "Scalable vectorization not supported for the reduction "
4874         "operations found in this loop.",
4875         "ScalableVFUnfeasible", ORE, TheLoop);
4876     return ElementCount::getScalable(0);
4877   }
4878 
4879   // Disable scalable vectorization if the loop contains any instructions
4880   // with element types not supported for scalable vectors.
4881   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4882         return !Ty->isVoidTy() &&
4883                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4884       })) {
4885     reportVectorizationInfo("Scalable vectorization is not supported "
4886                             "for all element types found in this loop.",
4887                             "ScalableVFUnfeasible", ORE, TheLoop);
4888     return ElementCount::getScalable(0);
4889   }
4890 
4891   if (Legal->isSafeForAnyVectorWidth())
4892     return MaxScalableVF;
4893 
4894   // Limit MaxScalableVF by the maximum safe dependence distance.
4895   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4896   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4897     MaxVScale =
4898         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4899   MaxScalableVF = ElementCount::getScalable(
4900       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
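  // For example, with MaxSafeElements == 32 and a maximum vscale of 16, this
  // clamps the result to vscale x 2; if no maximum vscale is known, the
  // scalable VF is conservatively set to 0.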
4901   if (!MaxScalableVF)
4902     reportVectorizationInfo(
4903         "Max legal vector width too small, scalable vectorization "
4904         "unfeasible.",
4905         "ScalableVFUnfeasible", ORE, TheLoop);
4906 
4907   return MaxScalableVF;
4908 }
4909 
4910 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4911     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4912   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4913   unsigned SmallestType, WidestType;
4914   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4915 
4916   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (i.e. the one involved in the
  // smallest dependence distance).
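  // For example, with a maximum safe vector width of 256 bits and a widest
  // type of 32 bits, MaxSafeElements = PowerOf2Floor(256 / 32) = 8.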
4920   unsigned MaxSafeElements =
4921       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4922 
4923   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4924   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4925 
4926   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4927                     << ".\n");
4928   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4929                     << ".\n");
4930 
  // First analyze the UserVF; fall back if the UserVF should be ignored.
4932   if (UserVF) {
4933     auto MaxSafeUserVF =
4934         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4935 
4936     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4937       // If `VF=vscale x N` is safe, then so is `VF=N`
4938       if (UserVF.isScalable())
4939         return FixedScalableVFPair(
4940             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4941       else
4942         return UserVF;
4943     }
4944 
4945     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4946 
4947     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4948     // is better to ignore the hint and let the compiler choose a suitable VF.
4949     if (!UserVF.isScalable()) {
4950       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4951                         << " is unsafe, clamping to max safe VF="
4952                         << MaxSafeFixedVF << ".\n");
4953       ORE->emit([&]() {
4954         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4955                                           TheLoop->getStartLoc(),
4956                                           TheLoop->getHeader())
4957                << "User-specified vectorization factor "
4958                << ore::NV("UserVectorizationFactor", UserVF)
4959                << " is unsafe, clamping to maximum safe vectorization factor "
4960                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4961       });
4962       return MaxSafeFixedVF;
4963     }
4964 
4965     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4966       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4967                         << " is ignored because scalable vectors are not "
4968                            "available.\n");
4969       ORE->emit([&]() {
4970         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4971                                           TheLoop->getStartLoc(),
4972                                           TheLoop->getHeader())
4973                << "User-specified vectorization factor "
4974                << ore::NV("UserVectorizationFactor", UserVF)
4975                << " is ignored because the target does not support scalable "
4976                   "vectors. The compiler will pick a more suitable value.";
4977       });
4978     } else {
4979       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4980                         << " is unsafe. Ignoring scalable UserVF.\n");
4981       ORE->emit([&]() {
4982         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4983                                           TheLoop->getStartLoc(),
4984                                           TheLoop->getHeader())
4985                << "User-specified vectorization factor "
4986                << ore::NV("UserVectorizationFactor", UserVF)
4987                << " is unsafe. Ignoring the hint to let the compiler pick a "
4988                   "more suitable value.";
4989       });
4990     }
4991   }
4992 
4993   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4994                     << " / " << WidestType << " bits.\n");
4995 
4996   FixedScalableVFPair Result(ElementCount::getFixed(1),
4997                              ElementCount::getScalable(0));
4998   if (auto MaxVF =
4999           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5000                                   MaxSafeFixedVF, FoldTailByMasking))
5001     Result.FixedVF = MaxVF;
5002 
5003   if (auto MaxVF =
5004           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5005                                   MaxSafeScalableVF, FoldTailByMasking))
5006     if (MaxVF.isScalable()) {
5007       Result.ScalableVF = MaxVF;
5008       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5009                         << "\n");
5010     }
5011 
5012   return Result;
5013 }
5014 
5015 FixedScalableVFPair
5016 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5017   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the runtime check is
    // still likely to be dynamically uniform if the target can skip it.
5020     reportVectorizationFailure(
5021         "Not inserting runtime ptr check for divergent target",
5022         "runtime pointer checks needed. Not enabled for divergent target",
5023         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5024     return FixedScalableVFPair::getNone();
5025   }
5026 
5027   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5028   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5029   if (TC == 1) {
5030     reportVectorizationFailure("Single iteration (non) loop",
5031         "loop trip count is one, irrelevant for vectorization",
5032         "SingleIterationLoop", ORE, TheLoop);
5033     return FixedScalableVFPair::getNone();
5034   }
5035 
5036   switch (ScalarEpilogueStatus) {
5037   case CM_ScalarEpilogueAllowed:
5038     return computeFeasibleMaxVF(TC, UserVF, false);
5039   case CM_ScalarEpilogueNotAllowedUsePredicate:
5040     LLVM_FALLTHROUGH;
5041   case CM_ScalarEpilogueNotNeededUsePredicate:
5042     LLVM_DEBUG(
5043         dbgs() << "LV: vector predicate hint/switch found.\n"
5044                << "LV: Not allowing scalar epilogue, creating predicated "
5045                << "vector loop.\n");
5046     break;
5047   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5048     // fallthrough as a special case of OptForSize
5049   case CM_ScalarEpilogueNotAllowedOptSize:
5050     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5051       LLVM_DEBUG(
5052           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5053     else
5054       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5055                         << "count.\n");
5056 
5057     // Bail if runtime checks are required, which are not good when optimising
5058     // for size.
5059     if (runtimeChecksRequired())
5060       return FixedScalableVFPair::getNone();
5061 
5062     break;
5063   }
5064 
  // The only loops we can vectorize without a scalar epilogue are loops with
5066   // a bottom-test and a single exiting block. We'd have to handle the fact
5067   // that not every instruction executes on the last iteration.  This will
5068   // require a lane mask which varies through the vector loop body.  (TODO)
5069   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5070     // If there was a tail-folding hint/switch, but we can't fold the tail by
5071     // masking, fallback to a vectorization with a scalar epilogue.
5072     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5073       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5074                            "scalar epilogue instead.\n");
5075       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5076       return computeFeasibleMaxVF(TC, UserVF, false);
5077     }
5078     return FixedScalableVFPair::getNone();
5079   }
5080 
  // Now try folding the tail by masking.
5082 
5083   // Invalidate interleave groups that require an epilogue if we can't mask
5084   // the interleave-group.
5085   if (!useMaskedInterleavedAccesses(TTI)) {
5086     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5087            "No decisions should have been taken at this point");
5088     // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5090     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5091   }
5092 
5093   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5094   // Avoid tail folding if the trip count is known to be a multiple of any VF
5095   // we chose.
5096   // FIXME: The condition below pessimises the case for fixed-width vectors,
5097   // when scalable VFs are also candidates for vectorization.
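  // For example, with an exit count of 64 and MaxVFtimesIC == 8 the remainder
  // below is zero, so no tail remains and tail folding is unnecessary.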
5098   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5099     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5100     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5101            "MaxFixedVF must be a power of 2");
5102     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5103                                    : MaxFixedVF.getFixedValue();
5104     ScalarEvolution *SE = PSE.getSE();
5105     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5106     const SCEV *ExitCount = SE->getAddExpr(
5107         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5108     const SCEV *Rem = SE->getURemExpr(
5109         SE->applyLoopGuards(ExitCount, TheLoop),
5110         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5111     if (Rem->isZero()) {
5112       // Accept MaxFixedVF if we do not have a tail.
5113       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5114       return MaxFactors;
5115     }
5116   }
5117 
5118   // If we don't know the precise trip count, or if the trip count that we
5119   // found modulo the vectorization factor is not zero, try to fold the tail
5120   // by masking.
5121   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5122   if (Legal->prepareToFoldTailByMasking()) {
5123     FoldTailByMasking = true;
5124     return MaxFactors;
5125   }
5126 
5127   // If there was a tail-folding hint/switch, but we can't fold the tail by
5128   // masking, fallback to a vectorization with a scalar epilogue.
5129   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5130     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5131                          "scalar epilogue instead.\n");
5132     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5133     return MaxFactors;
5134   }
5135 
5136   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5137     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5138     return FixedScalableVFPair::getNone();
5139   }
5140 
5141   if (TC == 0) {
5142     reportVectorizationFailure(
5143         "Unable to calculate the loop count due to complex control flow",
5144         "unable to calculate the loop count due to complex control flow",
5145         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5146     return FixedScalableVFPair::getNone();
5147   }
5148 
5149   reportVectorizationFailure(
5150       "Cannot optimize for size and vectorize at the same time.",
5151       "cannot optimize for size and vectorize at the same time. "
5152       "Enable vectorization of this loop with '#pragma clang loop "
5153       "vectorize(enable)' when compiling with -Os/-Oz",
5154       "NoTailLoopWithOptForSize", ORE, TheLoop);
5155   return FixedScalableVFPair::getNone();
5156 }
5157 
5158 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5159     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5160     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5161   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5162   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5163       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5164                            : TargetTransformInfo::RGK_FixedWidthVector);
5165 
5166   // Convenience function to return the minimum of two ElementCounts.
5167   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5168     assert((LHS.isScalable() == RHS.isScalable()) &&
5169            "Scalable flags must match");
5170     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5171   };
5172 
5173   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5175   auto MaxVectorElementCount = ElementCount::get(
5176       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5177       ComputeScalableMaxVF);
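  // For example, a 128-bit widest register and a 32-bit widest type give a
  // maximum element count of 4 (fixed) or vscale x 4 (scalable).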
5178   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5179   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5180                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5181 
5182   if (!MaxVectorElementCount) {
5183     LLVM_DEBUG(dbgs() << "LV: The target has no "
5184                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5185                       << " vector registers.\n");
5186     return ElementCount::getFixed(1);
5187   }
5188 
5189   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5190   if (ConstTripCount &&
5191       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5192       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5193     // If loop trip count (TC) is known at compile time there is no point in
5194     // choosing VF greater than TC (as done in the loop below). Select maximum
5195     // power of two which doesn't exceed TC.
5196     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5197     // when the TC is less than or equal to the known number of lanes.
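    // For example, when not folding the tail, a constant trip count of 7
    // clamps the VF to 4.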
5198     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5199     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5200                          "exceeding the constant trip count: "
5201                       << ClampedConstTripCount << "\n");
5202     return ElementCount::getFixed(ClampedConstTripCount);
5203   }
5204 
5205   TargetTransformInfo::RegisterKind RegKind =
5206       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5207                            : TargetTransformInfo::RGK_FixedWidthVector;
5208   ElementCount MaxVF = MaxVectorElementCount;
5209   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5210                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5211     auto MaxVectorElementCountMaxBW = ElementCount::get(
5212         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5213         ComputeScalableMaxVF);
5214     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5215 
5216     // Collect all viable vectorization factors larger than the default MaxVF
5217     // (i.e. MaxVectorElementCount).
5218     SmallVector<ElementCount, 8> VFs;
5219     for (ElementCount VS = MaxVectorElementCount * 2;
5220          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5221       VFs.push_back(VS);
5222 
5223     // For each VF calculate its register usage.
5224     auto RUs = calculateRegisterUsage(VFs);
5225 
5226     // Select the largest VF which doesn't require more registers than existing
5227     // ones.
5228     for (int i = RUs.size() - 1; i >= 0; --i) {
5229       bool Selected = true;
5230       for (auto &pair : RUs[i].MaxLocalUsers) {
5231         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5232         if (pair.second > TargetNumRegisters)
5233           Selected = false;
5234       }
5235       if (Selected) {
5236         MaxVF = VFs[i];
5237         break;
5238       }
5239     }
5240     if (ElementCount MinVF =
5241             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5242       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5243         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5244                           << ") with target's minimum: " << MinVF << '\n');
5245         MaxVF = MinVF;
5246       }
5247     }
5248 
5249     // Invalidate any widening decisions we might have made, in case the loop
5250     // requires prediction (decided later), but we have already made some
5251     // load/store widening decisions.
5252     invalidateCostModelingDecisions();
5253   }
5254   return MaxVF;
5255 }
5256 
5257 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5258   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5259     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5260     auto Min = Attr.getVScaleRangeMin();
5261     auto Max = Attr.getVScaleRangeMax();
5262     if (Max && Min == Max)
5263       return Max;
5264   }
5265 
5266   return TTI.getVScaleForTuning();
5267 }
5268 
5269 bool LoopVectorizationCostModel::isMoreProfitable(
5270     const VectorizationFactor &A, const VectorizationFactor &B) const {
5271   InstructionCost CostA = A.Cost;
5272   InstructionCost CostB = B.Cost;
5273 
5274   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5275 
5276   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5277       MaxTripCount) {
5278     // If we are folding the tail and the trip count is a known (possibly small)
5279     // constant, the trip count will be rounded up to an integer number of
5280     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5281     // which we compare directly. When not folding the tail, the total cost will
5282     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5283     // approximated with the per-lane cost below instead of using the tripcount
5284     // as here.
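    // For example, with MaxTripCount == 10, a VF=4 factor costing 20 per
    // iteration totals 3 * 20 = 60, while a VF=8 factor costing 36 totals
    // 2 * 36 = 72, so the VF=4 factor is preferred despite its higher
    // per-lane cost.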
5285     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5286     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5287     return RTCostA < RTCostB;
5288   }
5289 
5290   // Improve estimate for the vector width if it is scalable.
5291   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5292   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5293   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5294     if (A.Width.isScalable())
5295       EstimatedWidthA *= VScale.getValue();
5296     if (B.Width.isScalable())
5297       EstimatedWidthB *= VScale.getValue();
5298   }
5299 
5300   // Assume vscale may be larger than 1 (or the value being tuned for),
5301   // so that scalable vectorization is slightly favorable over fixed-width
5302   // vectorization.
5303   if (A.Width.isScalable() && !B.Width.isScalable())
5304     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5305 
5306   // To avoid the need for FP division:
5307   //      (CostA / A.Width) < (CostB / B.Width)
5308   // <=>  (CostA * B.Width) < (CostB * A.Width)
5309   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5310 }
5311 
5312 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5313     const ElementCountSet &VFCandidates) {
5314   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5315   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5316   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5317   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5318          "Expected Scalar VF to be a candidate");
5319 
5320   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5321   VectorizationFactor ChosenFactor = ScalarCost;
5322 
5323   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5324   if (ForceVectorization && VFCandidates.size() > 1) {
5325     // Ignore scalar width, because the user explicitly wants vectorization.
5326     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5327     // evaluation.
5328     ChosenFactor.Cost = InstructionCost::getMax();
5329   }
5330 
5331   SmallVector<InstructionVFPair> InvalidCosts;
5332   for (const auto &i : VFCandidates) {
5333     // The cost for scalar VF=1 is already calculated, so ignore it.
5334     if (i.isScalar())
5335       continue;
5336 
5337     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5338     VectorizationFactor Candidate(i, C.first);
5339 
5340 #ifndef NDEBUG
5341     unsigned AssumedMinimumVscale = 1;
5342     if (Optional<unsigned> VScale = getVScaleForTuning())
5343       AssumedMinimumVscale = VScale.getValue();
5344     unsigned Width =
5345         Candidate.Width.isScalable()
5346             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5347             : Candidate.Width.getFixedValue();
5348     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5349                       << " costs: " << (Candidate.Cost / Width));
5350     if (i.isScalable())
5351       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5352                         << AssumedMinimumVscale << ")");
5353     LLVM_DEBUG(dbgs() << ".\n");
5354 #endif
5355 
5356     if (!C.second && !ForceVectorization) {
5357       LLVM_DEBUG(
5358           dbgs() << "LV: Not considering vector loop of width " << i
5359                  << " because it will not generate any vector instructions.\n");
5360       continue;
5361     }
5362 
    // If profitable, add it to the ProfitableVFs list.
5364     if (isMoreProfitable(Candidate, ScalarCost))
5365       ProfitableVFs.push_back(Candidate);
5366 
5367     if (isMoreProfitable(Candidate, ChosenFactor))
5368       ChosenFactor = Candidate;
5369   }
5370 
5371   // Emit a report of VFs with invalid costs in the loop.
5372   if (!InvalidCosts.empty()) {
5373     // Group the remarks per instruction, keeping the instruction order from
5374     // InvalidCosts.
5375     std::map<Instruction *, unsigned> Numbering;
5376     unsigned I = 0;
5377     for (auto &Pair : InvalidCosts)
5378       if (!Numbering.count(Pair.first))
5379         Numbering[Pair.first] = I++;
5380 
5381     // Sort the list, first on instruction(number) then on VF.
5382     llvm::sort(InvalidCosts,
5383                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5384                  if (Numbering[A.first] != Numbering[B.first])
5385                    return Numbering[A.first] < Numbering[B.first];
5386                  ElementCountComparator ECC;
5387                  return ECC(A.second, B.second);
5388                });
5389 
5390     // For a list of ordered instruction-vf pairs:
5391     //   [(load, vf1), (load, vf2), (store, vf1)]
5392     // Group the instructions together to emit separate remarks for:
5393     //   load  (vf1, vf2)
5394     //   store (vf1)
5395     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5396     auto Subset = ArrayRef<InstructionVFPair>();
5397     do {
5398       if (Subset.empty())
5399         Subset = Tail.take_front(1);
5400 
5401       Instruction *I = Subset.front().first;
5402 
5403       // If the next instruction is different, or if there are no other pairs,
5404       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5408       if (Subset == Tail || Tail[Subset.size()].first != I) {
5409         std::string OutString;
5410         raw_string_ostream OS(OutString);
5411         assert(!Subset.empty() && "Unexpected empty range");
5412         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5413         for (auto &Pair : Subset)
5414           OS << (Pair.second == Subset.front().second ? "" : ", ")
5415              << Pair.second;
5416         OS << "):";
5417         if (auto *CI = dyn_cast<CallInst>(I))
5418           OS << " call to " << CI->getCalledFunction()->getName();
5419         else
5420           OS << " " << I->getOpcodeName();
5421         OS.flush();
5422         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5423         Tail = Tail.drop_front(Subset.size());
5424         Subset = {};
5425       } else
5426         // Grow the subset by one element
5427         Subset = Tail.take_front(Subset.size() + 1);
5428     } while (!Tail.empty());
5429   }
5430 
5431   if (!EnableCondStoresVectorization && NumPredStores) {
5432     reportVectorizationFailure("There are conditional stores.",
5433         "store that is conditionally executed prevents vectorization",
5434         "ConditionalStore", ORE, TheLoop);
5435     ChosenFactor = ScalarCost;
5436   }
5437 
5438   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5439                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5440              << "LV: Vectorization seems to be not beneficial, "
5441              << "but was forced by a user.\n");
5442   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5443   return ChosenFactor;
5444 }
5445 
5446 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5447     const Loop &L, ElementCount VF) const {
5448   // Cross iteration phis such as reductions need special handling and are
5449   // currently unsupported.
5450   if (any_of(L.getHeader()->phis(),
5451              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5452     return false;
5453 
5454   // Phis with uses outside of the loop require special handling and are
5455   // currently unsupported.
5456   for (auto &Entry : Legal->getInductionVars()) {
5457     // Look for uses of the value of the induction at the last iteration.
5458     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5459     for (User *U : PostInc->users())
5460       if (!L.contains(cast<Instruction>(U)))
5461         return false;
    // Look for uses of the penultimate value of the induction.
5463     for (User *U : Entry.first->users())
5464       if (!L.contains(cast<Instruction>(U)))
5465         return false;
5466   }
5467 
5468   // Induction variables that are widened require special handling that is
5469   // currently not supported.
5470   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5471         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5472                  this->isProfitableToScalarize(Entry.first, VF));
5473       }))
5474     return false;
5475 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly.  It may be fine, but it needs to be audited and
  // tested.
5479   if (L.getExitingBlock() != L.getLoopLatch())
5480     return false;
5481 
5482   return true;
5483 }
5484 
5485 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5486     const ElementCount VF) const {
5487   // FIXME: We need a much better cost-model to take different parameters such
5488   // as register pressure, code size increase and cost of extra branches into
5489   // account. For now we apply a very crude heuristic and only consider loops
5490   // with vectorization factors larger than a certain value.
5491   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5493   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5494     return false;
5495   // FIXME: We should consider changing the threshold for scalable
5496   // vectors to take VScaleForTuning into account.
5497   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5498     return true;
5499   return false;
5500 }
5501 
5502 VectorizationFactor
5503 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5504     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5505   VectorizationFactor Result = VectorizationFactor::Disabled();
5506   if (!EnableEpilogueVectorization) {
5507     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5508     return Result;
5509   }
5510 
5511   if (!isScalarEpilogueAllowed()) {
5512     LLVM_DEBUG(
5513         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5514                   "allowed.\n";);
5515     return Result;
5516   }
5517 
5518   // Not really a cost consideration, but check for unsupported cases here to
5519   // simplify the logic.
5520   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5521     LLVM_DEBUG(
5522         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5523                   "not a supported candidate.\n";);
5524     return Result;
5525   }
5526 
5527   if (EpilogueVectorizationForceVF > 1) {
5528     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5529     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5530     if (LVP.hasPlanWithVF(ForcedEC))
5531       return {ForcedEC, 0};
5532     else {
5533       LLVM_DEBUG(
5534           dbgs()
5535               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5536       return Result;
5537     }
5538   }
5539 
5540   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5541       TheLoop->getHeader()->getParent()->hasMinSize()) {
5542     LLVM_DEBUG(
5543         dbgs()
5544             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5545     return Result;
5546   }
5547 
5548   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5549     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5550                          "this loop\n");
5551     return Result;
5552   }
5553 
5554   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5555   // the main loop handles 8 lanes per iteration. We could still benefit from
5556   // vectorizing the epilogue loop with VF=4.
5557   ElementCount EstimatedRuntimeVF = MainLoopVF;
5558   if (MainLoopVF.isScalable()) {
5559     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5560     if (Optional<unsigned> VScale = getVScaleForTuning())
5561       EstimatedRuntimeVF *= VScale.getValue();
5562   }
5563 
5564   for (auto &NextVF : ProfitableVFs)
5565     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5566           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5567          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5568         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5569         LVP.hasPlanWithVF(NextVF.Width))
5570       Result = NextVF;
5571 
5572   if (Result != VectorizationFactor::Disabled())
5573     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5574                       << Result.Width << "\n";);
5575   return Result;
5576 }
5577 
5578 std::pair<unsigned, unsigned>
5579 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5580   unsigned MinWidth = -1U;
5581   unsigned MaxWidth = 8;
5582   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5583   // For in-loop reductions, no element types are added to ElementTypesInLoop
5584   // if there are no loads/stores in the loop. In this case, check through the
5585   // reduction variables to determine the maximum width.
5586   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5587     // Reset MaxWidth so that we can find the smallest type used by recurrences
5588     // in the loop.
5589     MaxWidth = -1U;
5590     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5591       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5592       // When finding the min width used by the recurrence we need to account
5593       // for casts on the input operands of the recurrence.
5594       MaxWidth = std::min<unsigned>(
5595           MaxWidth, std::min<unsigned>(
5596                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5597                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5598     }
5599   } else {
5600     for (Type *T : ElementTypesInLoop) {
5601       MinWidth = std::min<unsigned>(
5602           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5603       MaxWidth = std::max<unsigned>(
5604           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5605     }
5606   }
5607   return {MinWidth, MaxWidth};
5608 }
5609 
5610 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5611   ElementTypesInLoop.clear();
5612   // For each block.
5613   for (BasicBlock *BB : TheLoop->blocks()) {
5614     // For each instruction in the loop.
5615     for (Instruction &I : BB->instructionsWithoutDebug()) {
5616       Type *T = I.getType();
5617 
5618       // Skip ignored values.
5619       if (ValuesToIgnore.count(&I))
5620         continue;
5621 
5622       // Only examine Loads, Stores and PHINodes.
5623       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5624         continue;
5625 
5626       // Examine PHI nodes that are reduction variables. Update the type to
5627       // account for the recurrence type.
5628       if (auto *PN = dyn_cast<PHINode>(&I)) {
5629         if (!Legal->isReductionVariable(PN))
5630           continue;
5631         const RecurrenceDescriptor &RdxDesc =
5632             Legal->getReductionVars().find(PN)->second;
5633         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5634             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5635                                       RdxDesc.getRecurrenceType(),
5636                                       TargetTransformInfo::ReductionFlags()))
5637           continue;
5638         T = RdxDesc.getRecurrenceType();
5639       }
5640 
5641       // Examine the stored values.
5642       if (auto *ST = dyn_cast<StoreInst>(&I))
5643         T = ST->getValueOperand()->getType();
5644 
5645       assert(T->isSized() &&
5646              "Expected the load/store/recurrence type to be sized");
5647 
5648       ElementTypesInLoop.insert(T);
5649     }
5650   }
5651 }
5652 
5653 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5654                                                            unsigned LoopCost) {
5655   // -- The interleave heuristics --
5656   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5657   // There are many micro-architectural considerations that we can't predict
5658   // at this level. For example, frontend pressure (on decode or fetch) due to
5659   // code size, or the number and capabilities of the execution ports.
5660   //
5661   // We use the following heuristics to select the interleave count:
5662   // 1. If the code has reductions, then we interleave to break the cross
5663   // iteration dependency.
5664   // 2. If the loop is really small, then we interleave to reduce the loop
5665   // overhead.
5666   // 3. We don't interleave if we think that we will spill registers to memory
5667   // due to the increased register pressure.
5668 
5669   if (!isScalarEpilogueAllowed())
5670     return 1;
5671 
  // If there is a maximum safe dependence distance, it was already used to
  // limit the vectorization factor; do not interleave in that case.
5673   if (Legal->getMaxSafeDepDistBytes() != -1U)
5674     return 1;
5675 
5676   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5677   const bool HasReductions = !Legal->getReductionVars().empty();
5678   // Do not interleave loops with a relatively small known or estimated trip
5679   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5683   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5684       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5685     return 1;
5686 
5687   // If we did not calculate the cost for VF (because the user selected the VF)
5688   // then we calculate the cost of VF here.
5689   if (LoopCost == 0) {
5690     InstructionCost C = expectedCost(VF).first;
5691     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5692     LoopCost = *C.getValue();
5693 
5694     // Loop body is free and there is no need for interleaving.
5695     if (LoopCost == 0)
5696       return 1;
5697   }
5698 
5699   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so make sure each is at least one (i.e.
  // assume that at least one instruction uses at least one register).
5702   for (auto& pair : R.MaxLocalUsers) {
5703     pair.second = std::max(pair.second, 1U);
5704   }
5705 
5706   // We calculate the interleave count using the following formula.
5707   // Subtract the number of loop invariants from the number of available
5708   // registers. These registers are used by all of the interleaved instances.
5709   // Next, divide the remaining registers by the number of registers that is
5710   // required by the loop, in order to estimate how many parallel instances
5711   // fit without causing spills. All of this is rounded down if necessary to be
5712   // a power of two. We want power of two interleave count to simplify any
5713   // addressing operations or alignment considerations.
5714   // We also want power of two interleave counts to ensure that the induction
5715   // variable of the vector loop wraps to zero, when tail is folded by masking;
5716   // this currently happens when OptForSize, in which case IC is set to 1 above.
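  // For example, with 32 available registers, 2 of which hold loop-invariant
  // values, and a loop body needing 6 registers per instance, the interleave
  // count is PowerOf2Floor((32 - 2) / 6) = 4.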
5717   unsigned IC = UINT_MAX;
5718 
5719   for (auto& pair : R.MaxLocalUsers) {
5720     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5721     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5722                       << " registers of "
5723                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5724     if (VF.isScalar()) {
5725       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5726         TargetNumRegisters = ForceTargetNumScalarRegs;
5727     } else {
5728       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5729         TargetNumRegisters = ForceTargetNumVectorRegs;
5730     }
5731     unsigned MaxLocalUsers = pair.second;
5732     unsigned LoopInvariantRegs = 0;
5733     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5734       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5735 
5736     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5737     // Don't count the induction variable as interleaved.
5738     if (EnableIndVarRegisterHeur) {
5739       TmpIC =
5740           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5741                         std::max(1U, (MaxLocalUsers - 1)));
5742     }
5743 
5744     IC = std::min(IC, TmpIC);
5745   }
5746 
5747   // Clamp the interleave ranges to reasonable counts.
5748   unsigned MaxInterleaveCount =
5749       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5750 
5751   // Check if the user has overridden the max.
5752   if (VF.isScalar()) {
5753     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5754       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5755   } else {
5756     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5757       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5758   }
5759 
5760   // If trip count is known or estimated compile time constant, limit the
5761   // interleave count to be less than the trip count divided by VF, provided it
5762   // is at least 1.
5763   //
5764   // For scalable vectors we can't know if interleaving is beneficial. It may
5765   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
5767   // similar benefit as for fixed-width vectors. For now, we choose to leave
5768   // the InterleaveCount as if vscale is '1', although if some information about
5769   // the vector is known (e.g. min vector size), we can make a better decision.
5770   if (BestKnownTC) {
5771     MaxInterleaveCount =
5772         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5773     // Make sure MaxInterleaveCount is greater than 0.
5774     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5775   }
5776 
5777   assert(MaxInterleaveCount > 0 &&
5778          "Maximum interleave count must be greater than 0");
5779 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
5781   // that the target and trip count allows.
5782   if (IC > MaxInterleaveCount)
5783     IC = MaxInterleaveCount;
5784   else
5785     // Make sure IC is greater than 0.
5786     IC = std::max(1u, IC);
5787 
5788   assert(IC > 0 && "Interleave count must be greater than 0.");
5789 
5790   // Interleave if we vectorized this loop and there is a reduction that could
5791   // benefit from interleaving.
5792   if (VF.isVector() && HasReductions) {
5793     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5794     return IC;
5795   }
5796 
5797   // For any scalar loop that either requires runtime checks or predication we
5798   // are better off leaving this to the unroller. Note that if we've already
5799   // vectorized the loop we will have done the runtime check and so interleaving
5800   // won't require further checks.
5801   bool ScalarInterleavingRequiresPredication =
5802       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5803          return Legal->blockNeedsPredication(BB);
5804        }));
5805   bool ScalarInterleavingRequiresRuntimePointerCheck =
5806       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5807 
5808   // We want to interleave small loops in order to reduce the loop overhead and
5809   // potentially expose ILP opportunities.
5810   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5811                     << "LV: IC is " << IC << '\n'
5812                     << "LV: VF is " << VF << '\n');
5813   const bool AggressivelyInterleaveReductions =
5814       TTI.enableAggressiveInterleaving(HasReductions);
5815   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5816       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5817     // We assume that the cost overhead is 1 and we use the cost model
5818     // to estimate the cost of the loop and interleave until the cost of the
5819     // loop overhead is about 5% of the cost of the loop.
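    // For example, assuming the default SmallLoopCost of 20, a loop costing 5
    // yields SmallIC = min(IC, PowerOf2Floor(20 / 5)) = min(IC, 4).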
5820     unsigned SmallIC =
5821         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5822 
5823     // Interleave until store/load ports (estimated by max interleave count) are
5824     // saturated.
5825     unsigned NumStores = Legal->getNumStores();
5826     unsigned NumLoads = Legal->getNumLoads();
5827     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5828     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5829 
5830     // There is little point in interleaving for reductions containing selects
5831     // and compares when VF=1 since it may just create more overhead than it's
5832     // worth for loops with small trip counts. This is because we still have to
5833     // do the final reduction after the loop.
5834     bool HasSelectCmpReductions =
5835         HasReductions &&
5836         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5837           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5838           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5839               RdxDesc.getRecurrenceKind());
5840         });
5841     if (HasSelectCmpReductions) {
5842       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5843       return 1;
5844     }
5845 
5846     // If we have a scalar reduction (vector reductions are already dealt with
5847     // by this point), we can increase the critical path length if the loop
5848     // we're interleaving is inside another loop. For tree-wise reductions
5849     // set the limit to 2, and for ordered reductions it's best to disable
5850     // interleaving entirely.
5851     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5852       bool HasOrderedReductions =
5853           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5854             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5855             return RdxDesc.isOrdered();
5856           });
5857       if (HasOrderedReductions) {
5858         LLVM_DEBUG(
5859             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5860         return 1;
5861       }
5862 
5863       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5864       SmallIC = std::min(SmallIC, F);
5865       StoresIC = std::min(StoresIC, F);
5866       LoadsIC = std::min(LoadsIC, F);
5867     }
5868 
5869     if (EnableLoadStoreRuntimeInterleave &&
5870         std::max(StoresIC, LoadsIC) > SmallIC) {
5871       LLVM_DEBUG(
5872           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5873       return std::max(StoresIC, LoadsIC);
5874     }
5875 
5876     // If there are scalar reductions and TTI has enabled aggressive
5877     // interleaving for reductions, we will interleave to expose ILP.
5878     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5879         AggressivelyInterleaveReductions) {
5880       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5881       // Interleave no less than SmallIC but not as aggressive as the normal IC
5882       // to satisfy the rare situation when resources are too limited.
5883       return std::max(IC / 2, SmallIC);
5884     } else {
5885       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5886       return SmallIC;
5887     }
5888   }
5889 
5890   // Interleave if this is a large loop (small loops are already dealt with by
5891   // this point) that could benefit from interleaving.
5892   if (AggressivelyInterleaveReductions) {
5893     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5894     return IC;
5895   }
5896 
5897   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5898   return 1;
5899 }
5900 
5901 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5902 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5903   // This function calculates the register usage by measuring the highest number
5904   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5906   // assign a number to each instruction. We use RPO to ensure that defs are
5907   // met before their users. We assume that each instruction that has in-loop
5908   // users starts an interval. We record every time that an in-loop value is
5909   // used, so we have a list of the first and last occurrences of each
5910   // instruction. Next, we transpose this data structure into a multi map that
5911   // holds the list of intervals that *end* at a specific location. This multi
5912   // map allows us to perform a linear search. We scan the instructions linearly
5913   // and record each time that a new interval starts, by placing it in a set.
5914   // If we find this value in the multi-map then we remove it from the set.
5915   // The max register usage is the maximum size of the set.
5916   // We also search for instructions that are defined outside the loop, but are
5917   // used inside the loop. We need this number separately from the max-interval
5918   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
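  // As a rough illustration, for the sequence a = ...; b = a + 1; c = b + a;
  // the intervals of 'a' and 'b' are both open at 'c', so two values are live
  // there and the maximum usage for that register class is 2.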
5920   LoopBlocksDFS DFS(TheLoop);
5921   DFS.perform(LI);
5922 
5923   RegisterUsage RU;
5924 
5925   // Each 'key' in the map opens a new interval. The values
5926   // of the map are the index of the 'last seen' usage of the
5927   // instruction that is the key.
5928   using IntervalMap = DenseMap<Instruction *, unsigned>;
5929 
5930   // Maps instruction to its index.
5931   SmallVector<Instruction *, 64> IdxToInstr;
5932   // Marks the end of each interval.
5933   IntervalMap EndPoint;
5934   // Saves the list of instruction indices that are used in the loop.
5935   SmallPtrSet<Instruction *, 8> Ends;
5936   // Saves the list of values that are used in the loop but are
5937   // defined outside the loop, such as arguments and constants.
5938   SmallPtrSet<Value *, 8> LoopInvariants;
5939 
5940   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5941     for (Instruction &I : BB->instructionsWithoutDebug()) {
5942       IdxToInstr.push_back(&I);
5943 
5944       // Save the end location of each USE.
5945       for (Value *U : I.operands()) {
5946         auto *Instr = dyn_cast<Instruction>(U);
5947 
5948         // Ignore non-instruction values such as arguments, constants, etc.
5949         if (!Instr)
5950           continue;
5951 
5952         // If this instruction is outside the loop then record it and continue.
5953         if (!TheLoop->contains(Instr)) {
5954           LoopInvariants.insert(Instr);
5955           continue;
5956         }
5957 
5958         // Overwrite previous end points.
5959         EndPoint[Instr] = IdxToInstr.size();
5960         Ends.insert(Instr);
5961       }
5962     }
5963   }
5964 
5965   // Saves the list of intervals that end with the index in 'key'.
5966   using InstrList = SmallVector<Instruction *, 2>;
5967   DenseMap<unsigned, InstrList> TransposeEnds;
5968 
5969   // Transpose the EndPoints to a list of values that end at each index.
5970   for (auto &Interval : EndPoint)
5971     TransposeEnds[Interval.second].push_back(Interval.first);
5972 
5973   SmallPtrSet<Instruction *, 8> OpenIntervals;
5974   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5975   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5976 
5977   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5978 
5979   auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
5980     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5981       return 0;
5982     return TTI.getRegUsageForType(VectorType::get(Ty, VF));
5983   };
5984 
5985   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5986     Instruction *I = IdxToInstr[i];
5987 
5988     // Remove all of the instructions that end at this location.
5989     InstrList &List = TransposeEnds[i];
5990     for (Instruction *ToRemove : List)
5991       OpenIntervals.erase(ToRemove);
5992 
5993     // Ignore instructions that are never used within the loop.
5994     if (!Ends.count(I))
5995       continue;
5996 
5997     // Skip ignored values.
5998     if (ValuesToIgnore.count(I))
5999       continue;
6000 
6001     // For each VF find the maximum usage of registers.
6002     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6003       // Count the number of live intervals.
6004       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6005 
6006       if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
6014       } else {
6015         collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
6034       }
6035 
      for (auto &pair : RegUsage)
        MaxUsages[j][pair.first] =
            std::max(MaxUsages[j][pair.first], pair.second);
6042     }
6043 
6044     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6045                       << OpenIntervals.size() << '\n');
6046 
6047     // Add the current instruction to the list of open intervals.
6048     OpenIntervals.insert(I);
6049   }
6050 
6051   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6052     SmallMapVector<unsigned, unsigned, 4> Invariant;
6053 
6054     for (auto Inst : LoopInvariants) {
6055       unsigned Usage =
6056           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6057       unsigned ClassID =
6058           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6063     }
6064 
6065     LLVM_DEBUG({
6066       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6067       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6068              << " item\n";
6069       for (const auto &pair : MaxUsages[i]) {
6070         dbgs() << "LV(REG): RegisterClass: "
6071                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6072                << " registers\n";
6073       }
6074       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6075              << " item\n";
6076       for (const auto &pair : Invariant) {
6077         dbgs() << "LV(REG): RegisterClass: "
6078                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6079                << " registers\n";
6080       }
6081     });
6082 
6083     RU.LoopInvariantRegs = Invariant;
6084     RU.MaxLocalUsers = MaxUsages[i];
6085     RUs[i] = RU;
6086   }
6087 
6088   return RUs;
6089 }
6090 
6091 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6092                                                            ElementCount VF) {
6093   // TODO: Cost model for emulated masked load/store is completely
6094   // broken. This hack guides the cost model to use an artificially
6095   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // Only a limited number of masked store/scatter emulations was allowed.
6101   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6102   return isa<LoadInst>(I) ||
6103          (isa<StoreInst>(I) &&
6104           NumPredStores > NumberOfStoresToPredicate);
6105 }
6106 
6107 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6108   // If we aren't vectorizing the loop, or if we've already collected the
6109   // instructions to scalarize, there's nothing to do. Collection may already
6110   // have occurred if we have a user-selected VF and are now computing the
6111   // expected cost for interleaving.
6112   if (VF.isScalar() || VF.isZero() ||
6113       InstsToScalarize.find(VF) != InstsToScalarize.end())
6114     return;
6115 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6117   // not profitable to scalarize any instructions, the presence of VF in the
6118   // map will indicate that we've analyzed it already.
6119   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6120 
  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
6124   for (BasicBlock *BB : TheLoop->blocks()) {
6125     if (!blockNeedsPredicationForAnyReason(BB))
6126       continue;
6127     for (Instruction &I : *BB)
6128       if (isScalarWithPredication(&I, VF)) {
6129         ScalarCostsTy ScalarCosts;
6130         // Do not apply discount if scalable, because that would lead to
6131         // invalid scalarization costs.
6132         // Do not apply discount logic if hacked cost is needed
6133         // for emulated masked memrefs.
6134         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6135             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6136           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6137         // Remember that BB will remain after vectorization.
6138         PredicatedBBsAfterVectorization.insert(BB);
6139       }
6140   }
6141 }
6142 
6143 int LoopVectorizationCostModel::computePredInstDiscount(
6144     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6145   assert(!isUniformAfterVectorization(PredInst, VF) &&
6146          "Instruction marked uniform-after-vectorization will be predicated");
6147 
6148   // Initialize the discount to zero, meaning that the scalar version and the
6149   // vector version cost the same.
6150   InstructionCost Discount = 0;
6151 
6152   // Holds instructions to analyze. The instructions we visit are mapped in
6153   // ScalarCosts. Those instructions are the ones that would be scalarized if
6154   // we find that the scalar version costs less.
6155   SmallVector<Instruction *, 8> Worklist;
6156 
6157   // Returns true if the given instruction can be scalarized.
6158   auto canBeScalarized = [&](Instruction *I) -> bool {
6159     // We only attempt to scalarize instructions forming a single-use chain
6160     // from the original predicated block that would otherwise be vectorized.
6161     // Although not strictly necessary, we give up on instructions we know will
6162     // already be scalar to avoid traversing chains that are unlikely to be
6163     // beneficial.
6164     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6165         isScalarAfterVectorization(I, VF))
6166       return false;
6167 
6168     // If the instruction is scalar with predication, it will be analyzed
6169     // separately. We ignore it within the context of PredInst.
6170     if (isScalarWithPredication(I, VF))
6171       return false;
6172 
6173     // If any of the instruction's operands are uniform after vectorization,
6174     // the instruction cannot be scalarized. This prevents, for example, a
6175     // masked load from being scalarized.
6176     //
6177     // We assume we will only emit a value for lane zero of an instruction
6178     // marked uniform after vectorization, rather than VF identical values.
6179     // Thus, if we scalarize an instruction that uses a uniform, we would
6180     // create uses of values corresponding to the lanes we aren't emitting code
6181     // for. This behavior can be changed by allowing getScalarValue to clone
6182     // the lane zero values for uniforms rather than asserting.
6183     for (Use &U : I->operands())
6184       if (auto *J = dyn_cast<Instruction>(U.get()))
6185         if (isUniformAfterVectorization(J, VF))
6186           return false;
6187 
6188     // Otherwise, we can scalarize the instruction.
6189     return true;
6190   };
6191 
6192   // Compute the expected cost discount from scalarizing the entire expression
6193   // feeding the predicated instruction. We currently only consider expressions
6194   // that are single-use instruction chains.
6195   Worklist.push_back(PredInst);
6196   while (!Worklist.empty()) {
6197     Instruction *I = Worklist.pop_back_val();
6198 
6199     // If we've already analyzed the instruction, there's nothing to do.
6200     if (ScalarCosts.find(I) != ScalarCosts.end())
6201       continue;
6202 
6203     // Compute the cost of the vector instruction. Note that this cost already
6204     // includes the scalarization overhead of the predicated instruction.
6205     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6206 
6207     // Compute the cost of the scalarized instruction. This cost is the cost of
6208     // the instruction as if it wasn't if-converted and instead remained in the
6209     // predicated block. We will scale this cost by block probability after
6210     // computing the scalarization overhead.
6211     InstructionCost ScalarCost =
6212         VF.getFixedValue() *
6213         getInstructionCost(I, ElementCount::getFixed(1)).first;
6214 
6215     // Compute the scalarization overhead of needed insertelement instructions
6216     // and phi nodes.
6217     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6218       ScalarCost += TTI.getScalarizationOverhead(
6219           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6220           APInt::getAllOnes(VF.getFixedValue()), true, false);
6221       ScalarCost +=
6222           VF.getFixedValue() *
6223           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6224     }
6225 
6226     // Compute the scalarization overhead of needed extractelement
6227     // instructions. For each of the instruction's operands, if the operand can
6228     // be scalarized, add it to the worklist; otherwise, account for the
6229     // overhead.
6230     for (Use &U : I->operands())
6231       if (auto *J = dyn_cast<Instruction>(U.get())) {
6232         assert(VectorType::isValidElementType(J->getType()) &&
6233                "Instruction has non-scalar type");
6234         if (canBeScalarized(J))
6235           Worklist.push_back(J);
6236         else if (needsExtract(J, VF)) {
6237           ScalarCost += TTI.getScalarizationOverhead(
6238               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6239               APInt::getAllOnes(VF.getFixedValue()), false, true);
6240         }
6241       }
6242 
6243     // Scale the total scalar cost by block probability.
6244     ScalarCost /= getReciprocalPredBlockProb();
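    // Illustrative sketch (hypothetical numbers): with the 50% block
    // probability assumed by getReciprocalPredBlockProb(), a raw scalar cost
    // of 8 is scaled down to 4 before being compared to the vector cost.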
6245 
6246     // Compute the discount. A non-negative discount means the vector version
6247     // of the instruction costs more, and scalarizing would be beneficial.
6248     Discount += VectorCost - ScalarCost;
6249     ScalarCosts[I] = ScalarCost;
6250   }
6251 
6252   return *Discount.getValue();
6253 }
6254 
6255 LoopVectorizationCostModel::VectorizationCostTy
6256 LoopVectorizationCostModel::expectedCost(
6257     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6258   VectorizationCostTy Cost;
6259 
6260   // For each block.
6261   for (BasicBlock *BB : TheLoop->blocks()) {
6262     VectorizationCostTy BlockCost;
6263 
6264     // For each instruction in the old loop.
6265     for (Instruction &I : BB->instructionsWithoutDebug()) {
6266       // Skip ignored values.
6267       if (ValuesToIgnore.count(&I) ||
6268           (VF.isVector() && VecValuesToIgnore.count(&I)))
6269         continue;
6270 
6271       VectorizationCostTy C = getInstructionCost(&I, VF);
6272 
6273       // Check if we should override the cost.
6274       if (C.first.isValid() &&
6275           ForceTargetInstructionCost.getNumOccurrences() > 0)
6276         C.first = InstructionCost(ForceTargetInstructionCost);
6277 
6278       // Keep a list of instructions with invalid costs.
6279       if (Invalid && !C.first.isValid())
6280         Invalid->emplace_back(&I, VF);
6281 
6282       BlockCost.first += C.first;
6283       BlockCost.second |= C.second;
6284       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6285                         << " for VF " << VF << " For instruction: " << I
6286                         << '\n');
6287     }
6288 
6289     // If we are vectorizing a predicated block, it will have been
6290     // if-converted. This means that the block's instructions (aside from
6291     // stores and instructions that may divide by zero) will now be
6292     // unconditionally executed. For the scalar case, we may not always execute
6293     // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as not to include all blocks in tail-folded loops.
6296     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6297       BlockCost.first /= getReciprocalPredBlockProb();
6298 
6299     Cost.first += BlockCost.first;
6300     Cost.second |= BlockCost.second;
6301   }
6302 
6303   return Cost;
6304 }
6305 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6308 ///
6309 /// This SCEV can be sent to the Target in order to estimate the address
6310 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6316 
6317   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6318   if (!Gep)
6319     return nullptr;
6320 
6321   // We are looking for a gep with all loop invariant indices except for one
6322   // which should be an induction variable.
6323   auto SE = PSE.getSE();
6324   unsigned NumOperands = Gep->getNumOperands();
6325   for (unsigned i = 1; i < NumOperands; ++i) {
6326     Value *Opd = Gep->getOperand(i);
6327     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6328         !Legal->isInductionVariable(Opd))
6329       return nullptr;
6330   }
6331 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6333   return PSE.getSCEV(Ptr);
6334 }
6335 
6336 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6337   return Legal->hasStride(I->getOperand(0)) ||
6338          Legal->hasStride(I->getOperand(1));
6339 }
6340 
6341 InstructionCost
6342 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6343                                                         ElementCount VF) {
6344   assert(VF.isVector() &&
6345          "Scalarization cost of instruction implies vectorization.");
6346   if (VF.isScalable())
6347     return InstructionCost::getInvalid();
6348 
6349   Type *ValTy = getLoadStoreType(I);
6350   auto SE = PSE.getSE();
6351 
6352   unsigned AS = getLoadStoreAddressSpace(I);
6353   Value *Ptr = getLoadStorePointerOperand(I);
6354   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6355   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6356   //       that it is being called from this specific place.
6357 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6360   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6361 
6362   // Get the cost of the scalar memory instruction and address computation.
6363   InstructionCost Cost =
6364       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6365 
6366   // Don't pass *I here, since it is scalar but will actually be part of a
6367   // vectorized loop where the user of it is a vectorized instruction.
6368   const Align Alignment = getLoadStoreAlignment(I);
6369   Cost += VF.getKnownMinValue() *
6370           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6371                               AS, TTI::TCK_RecipThroughput);
6372 
6373   // Get the overhead of the extractelement and insertelement instructions
6374   // we might create due to scalarization.
6375   Cost += getScalarizationOverhead(I, VF);
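  // As a rough example with hypothetical numbers: a scalarized load at VF = 4
  // is charged 4 address computations + 4 scalar loads + the insert/extract
  // overhead needed to move values between vectors and scalars.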
6376 
6377   // If we have a predicated load/store, it will need extra i1 extracts and
6378   // conditional branches, but may not be executed for each vector lane. Scale
6379   // the cost by the probability of executing the predicated block.
6380   if (isPredicatedInst(I, VF)) {
6381     Cost /= getReciprocalPredBlockProb();
6382 
6383     // Add the cost of an i1 extract and a branch
6384     auto *Vec_i1Ty =
6385         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6386     Cost += TTI.getScalarizationOverhead(
6387         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6388         /*Insert=*/false, /*Extract=*/true);
6389     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6390 
6391     if (useEmulatedMaskMemRefHack(I, VF))
6392       // Artificially setting to a high enough value to practically disable
6393       // vectorization with such operations.
6394       Cost = 3000000;
6395   }
6396 
6397   return Cost;
6398 }
6399 
6400 InstructionCost
6401 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6402                                                     ElementCount VF) {
6403   Type *ValTy = getLoadStoreType(I);
6404   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6405   Value *Ptr = getLoadStorePointerOperand(I);
6406   unsigned AS = getLoadStoreAddressSpace(I);
6407   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6408   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6409 
6410   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6411          "Stride should be 1 or -1 for consecutive memory access");
6412   const Align Alignment = getLoadStoreAlignment(I);
6413   InstructionCost Cost = 0;
6414   if (Legal->isMaskRequired(I))
6415     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6416                                       CostKind);
6417   else
6418     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6419                                 CostKind, I);
6420 
6421   bool Reverse = ConsecutiveStride < 0;
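  // A negative consecutive stride means the accesses run backwards through
  // memory, so a reverse shuffle is needed to restore the original element
  // order after the wide load (or before the wide store).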
6422   if (Reverse)
6423     Cost +=
6424         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6425   return Cost;
6426 }
6427 
6428 InstructionCost
6429 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6430                                                 ElementCount VF) {
6431   assert(Legal->isUniformMemOp(*I));
6432 
6433   Type *ValTy = getLoadStoreType(I);
6434   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6435   const Align Alignment = getLoadStoreAlignment(I);
6436   unsigned AS = getLoadStoreAddressSpace(I);
6437   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6438   if (isa<LoadInst>(I)) {
6439     return TTI.getAddressComputationCost(ValTy) +
6440            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6441                                CostKind) +
6442            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6443   }
6444   StoreInst *SI = cast<StoreInst>(I);
6445 
6446   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
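  // A uniform store stays scalar: one address computation plus one scalar
  // store, and, if the stored value is not loop invariant, an extract of the
  // last vector lane being stored.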
6447   return TTI.getAddressComputationCost(ValTy) +
6448          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6449                              CostKind) +
6450          (isLoopInvariantStoreValue
6451               ? 0
6452               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6453                                        VF.getKnownMinValue() - 1));
6454 }
6455 
6456 InstructionCost
6457 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6458                                                  ElementCount VF) {
6459   Type *ValTy = getLoadStoreType(I);
6460   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6461   const Align Alignment = getLoadStoreAlignment(I);
6462   const Value *Ptr = getLoadStorePointerOperand(I);
6463 
6464   return TTI.getAddressComputationCost(VectorTy) +
6465          TTI.getGatherScatterOpCost(
6466              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6467              TargetTransformInfo::TCK_RecipThroughput, I);
6468 }
6469 
6470 InstructionCost
6471 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6472                                                    ElementCount VF) {
6473   // TODO: Once we have support for interleaving with scalable vectors
6474   // we can calculate the cost properly here.
6475   if (VF.isScalable())
6476     return InstructionCost::getInvalid();
6477 
6478   Type *ValTy = getLoadStoreType(I);
6479   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6480   unsigned AS = getLoadStoreAddressSpace(I);
6481 
6482   auto Group = getInterleavedAccessGroup(I);
6483   assert(Group && "Fail to get an interleaved access group.");
6484 
6485   unsigned InterleaveFactor = Group->getFactor();
6486   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6487 
6488   // Holds the indices of existing members in the interleaved group.
6489   SmallVector<unsigned, 4> Indices;
6490   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6491     if (Group->getMember(IF))
6492       Indices.push_back(IF);
6493 
6494   // Calculate the cost of the whole interleaved group.
6495   bool UseMaskForGaps =
6496       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6497       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6498   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6499       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6500       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6501 
6502   if (Group->isReverse()) {
6503     // TODO: Add support for reversed masked interleaved access.
6504     assert(!Legal->isMaskRequired(I) &&
6505            "Reverse masked interleaved access not supported.");
6506     Cost +=
6507         Group->getNumMembers() *
6508         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6509   }
6510   return Cost;
6511 }
6512 
6513 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6514     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6515   using namespace llvm::PatternMatch;
  // Early exit for no in-loop reductions.
6517   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6518     return None;
6519   auto *VectorTy = cast<VectorType>(Ty);
6520 
  // We are looking for one of the following patterns, choosing the minimal
  // acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
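  //
  // As a hypothetical IR sketch of the fullest form handled here, an in-loop
  // add reduction fed by a widening multiply might look like:
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %rdx   = add i32 %mul, %rdx.phi
  // in which case the whole chain may be costed as one extended multiply-add
  // reduction rather than as its individual components.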
6533   Instruction *RetI = I;
6534   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6535     if (!RetI->hasOneUser())
6536       return None;
6537     RetI = RetI->user_back();
6538   }
6539   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6540       RetI->user_back()->getOpcode() == Instruction::Add) {
6541     if (!RetI->hasOneUser())
6542       return None;
6543     RetI = RetI->user_back();
6544   }
6545 
  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying that the caller should use the original cost modelling.
6548   if (!InLoopReductionImmediateChains.count(RetI))
6549     return None;
6550 
6551   // Find the reduction this chain is a part of and calculate the basic cost of
6552   // the reduction on its own.
6553   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6554   Instruction *ReductionPhi = LastChain;
6555   while (!isa<PHINode>(ReductionPhi))
6556     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6557 
6558   const RecurrenceDescriptor &RdxDesc =
6559       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6560 
6561   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6562       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6563 
6564   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6565   // normal fmul instruction to the cost of the fadd reduction.
6566   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6567     BaseCost +=
6568         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6569 
6570   // If we're using ordered reductions then we can just return the base cost
6571   // here, since getArithmeticReductionCost calculates the full ordered
6572   // reduction cost when FP reassociation is not allowed.
6573   if (useOrderedReductions(RdxDesc))
6574     return BaseCost;
6575 
6576   // Get the operand that was not the reduction chain and match it to one of the
6577   // patterns, returning the better cost if it is found.
6578   Instruction *RedOp = RetI->getOperand(1) == LastChain
6579                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6580                            : dyn_cast<Instruction>(RetI->getOperand(1));
6581 
  // Rebuild VectorTy using the scalar type of the reduction operand, keeping
  // the original element count.
  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6583 
6584   Instruction *Op0, *Op1;
6585   if (RedOp &&
6586       match(RedOp,
6587             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6588       match(Op0, m_ZExtOrSExt(m_Value())) &&
6589       Op0->getOpcode() == Op1->getOpcode() &&
6590       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6591       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6592       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6593 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6595     // Note that the extend opcodes need to all match, or if A==B they will have
6596     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6597     // which is equally fine.
6598     bool IsUnsigned = isa<ZExtInst>(Op0);
6599     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6600     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6601 
6602     InstructionCost ExtCost =
6603         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6604                              TTI::CastContextHint::None, CostKind, Op0);
6605     InstructionCost MulCost =
6606         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6607     InstructionCost Ext2Cost =
6608         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6609                              TTI::CastContextHint::None, CostKind, RedOp);
6610 
6611     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6612         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6613         CostKind);
6614 
6615     if (RedCost.isValid() &&
6616         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6617       return I == RetI ? RedCost : 0;
6618   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6619              !TheLoop->isLoopInvariant(RedOp)) {
6620     // Matched reduce(ext(A))
6621     bool IsUnsigned = isa<ZExtInst>(RedOp);
6622     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6623     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6624         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6625         CostKind);
6626 
6627     InstructionCost ExtCost =
6628         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6629                              TTI::CastContextHint::None, CostKind, RedOp);
6630     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6631       return I == RetI ? RedCost : 0;
6632   } else if (RedOp &&
6633              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6634     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6635         Op0->getOpcode() == Op1->getOpcode() &&
6636         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6637       bool IsUnsigned = isa<ZExtInst>(Op0);
6638       Type *Op0Ty = Op0->getOperand(0)->getType();
6639       Type *Op1Ty = Op1->getOperand(0)->getType();
6640       Type *LargestOpTy =
6641           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6642                                                                     : Op0Ty;
6643       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6644 
6645       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6646       // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6648       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6649           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6650           TTI::CastContextHint::None, CostKind, Op0);
6651       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6652           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6653           TTI::CastContextHint::None, CostKind, Op1);
6654       InstructionCost MulCost =
6655           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6656 
6657       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6658           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6659           CostKind);
6660       InstructionCost ExtraExtCost = 0;
6661       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6662         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6663         ExtraExtCost = TTI.getCastInstrCost(
6664             ExtraExtOp->getOpcode(), ExtType,
6665             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6666             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6667       }
6668 
6669       if (RedCost.isValid() &&
6670           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6671         return I == RetI ? RedCost : 0;
6672     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6673       // Matched reduce(mul())
6674       InstructionCost MulCost =
6675           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6676 
6677       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6678           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6679           CostKind);
6680 
6681       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6682         return I == RetI ? RedCost : 0;
6683     }
6684   }
6685 
6686   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6687 }
6688 
6689 InstructionCost
6690 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6691                                                      ElementCount VF) {
6692   // Calculate scalar cost only. Vectorization cost should be ready at this
6693   // moment.
6694   if (VF.isScalar()) {
6695     Type *ValTy = getLoadStoreType(I);
6696     const Align Alignment = getLoadStoreAlignment(I);
6697     unsigned AS = getLoadStoreAddressSpace(I);
6698 
6699     return TTI.getAddressComputationCost(ValTy) +
6700            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6701                                TTI::TCK_RecipThroughput, I);
6702   }
6703   return getWideningCost(I, VF);
6704 }
6705 
6706 LoopVectorizationCostModel::VectorizationCostTy
6707 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6708                                                ElementCount VF) {
6709   // If we know that this instruction will remain uniform, check the cost of
6710   // the scalar version.
6711   if (isUniformAfterVectorization(I, VF))
6712     VF = ElementCount::getFixed(1);
6713 
6714   if (VF.isVector() && isProfitableToScalarize(I, VF))
6715     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6716 
6717   // Forced scalars do not have any scalarization overhead.
6718   auto ForcedScalar = ForcedScalars.find(VF);
6719   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6720     auto InstSet = ForcedScalar->second;
6721     if (InstSet.count(I))
6722       return VectorizationCostTy(
6723           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6724            VF.getKnownMinValue()),
6725           false);
6726   }
6727 
6728   Type *VectorTy;
6729   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6730 
6731   bool TypeNotScalarized = false;
6732   if (VF.isVector() && VectorTy->isVectorTy()) {
6733     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
6734     if (NumParts)
6735       TypeNotScalarized = NumParts < VF.getKnownMinValue();
6736     else
6737       C = InstructionCost::getInvalid();
6738   }
6739   return VectorizationCostTy(C, TypeNotScalarized);
6740 }
6741 
6742 InstructionCost
6743 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6744                                                      ElementCount VF) const {
6745 
6746   // There is no mechanism yet to create a scalable scalarization loop,
6747   // so this is currently Invalid.
6748   if (VF.isScalable())
6749     return InstructionCost::getInvalid();
6750 
6751   if (VF.isScalar())
6752     return 0;
6753 
6754   InstructionCost Cost = 0;
6755   Type *RetTy = ToVectorTy(I->getType(), VF);
6756   if (!RetTy->isVoidTy() &&
6757       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6758     Cost += TTI.getScalarizationOverhead(
6759         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6760         false);
6761 
6762   // Some targets keep addresses scalar.
6763   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6764     return Cost;
6765 
6766   // Some targets support efficient element stores.
6767   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6768     return Cost;
6769 
6770   // Collect operands to consider.
6771   CallInst *CI = dyn_cast<CallInst>(I);
6772   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6773 
6774   // Skip operands that do not require extraction/scalarization and do not incur
6775   // any overhead.
6776   SmallVector<Type *> Tys;
6777   for (auto *V : filterExtractingOperands(Ops, VF))
6778     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6779   return Cost + TTI.getOperandsScalarizationOverhead(
6780                     filterExtractingOperands(Ops, VF), Tys);
6781 }
6782 
6783 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6784   if (VF.isScalar())
6785     return;
6786   NumPredStores = 0;
6787   for (BasicBlock *BB : TheLoop->blocks()) {
6788     // For each instruction in the old loop.
6789     for (Instruction &I : *BB) {
6790       Value *Ptr =  getLoadStorePointerOperand(&I);
6791       if (!Ptr)
6792         continue;
6793 
6794       // TODO: We should generate better code and update the cost model for
6795       // predicated uniform stores. Today they are treated as any other
6796       // predicated store (see added test cases in
6797       // invariant-store-vectorization.ll).
6798       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6799         NumPredStores++;
6800 
6801       if (Legal->isUniformMemOp(I)) {
6802         // TODO: Avoid replicating loads and stores instead of
6803         // relying on instcombine to remove them.
6804         // Load: Scalar load + broadcast
6805         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6806         InstructionCost Cost;
6807         if (isa<StoreInst>(&I) && VF.isScalable() &&
6808             isLegalGatherOrScatter(&I, VF)) {
6809           Cost = getGatherScatterCost(&I, VF);
6810           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6811         } else {
6812           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
6813                  "Cannot yet scalarize uniform stores");
6814           Cost = getUniformMemOpCost(&I, VF);
6815           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6816         }
6817         continue;
6818       }
6819 
6820       // We assume that widening is the best solution when possible.
6821       if (memoryInstructionCanBeWidened(&I, VF)) {
6822         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6823         int ConsecutiveStride = Legal->isConsecutivePtr(
6824             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6825         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6826                "Expected consecutive stride.");
6827         InstWidening Decision =
6828             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6829         setWideningDecision(&I, VF, Decision, Cost);
6830         continue;
6831       }
6832 
6833       // Choose between Interleaving, Gather/Scatter or Scalarization.
6834       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6835       unsigned NumAccesses = 1;
6836       if (isAccessInterleaved(&I)) {
6837         auto Group = getInterleavedAccessGroup(&I);
6838         assert(Group && "Fail to get an interleaved access group.");
6839 
6840         // Make one decision for the whole group.
6841         if (getWideningDecision(&I, VF) != CM_Unknown)
6842           continue;
6843 
6844         NumAccesses = Group->getNumMembers();
6845         if (interleavedAccessCanBeWidened(&I, VF))
6846           InterleaveCost = getInterleaveGroupCost(&I, VF);
6847       }
6848 
6849       InstructionCost GatherScatterCost =
6850           isLegalGatherOrScatter(&I, VF)
6851               ? getGatherScatterCost(&I, VF) * NumAccesses
6852               : InstructionCost::getInvalid();
6853 
6854       InstructionCost ScalarizationCost =
6855           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6856 
6857       // Choose better solution for the current VF,
6858       // write down this decision and use it during vectorization.
6859       InstructionCost Cost;
6860       InstWidening Decision;
6861       if (InterleaveCost <= GatherScatterCost &&
6862           InterleaveCost < ScalarizationCost) {
6863         Decision = CM_Interleave;
6864         Cost = InterleaveCost;
6865       } else if (GatherScatterCost < ScalarizationCost) {
6866         Decision = CM_GatherScatter;
6867         Cost = GatherScatterCost;
6868       } else {
6869         Decision = CM_Scalarize;
6870         Cost = ScalarizationCost;
6871       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6875       if (auto Group = getInterleavedAccessGroup(&I))
6876         setWideningDecision(Group, VF, Decision, Cost);
6877       else
6878         setWideningDecision(&I, VF, Decision, Cost);
6879     }
6880   }
6881 
6882   // Make sure that any load of address and any other address computation
6883   // remains scalar unless there is gather/scatter support. This avoids
6884   // inevitable extracts into address registers, and also has the benefit of
6885   // activating LSR more, since that pass can't optimize vectorized
6886   // addresses.
6887   if (TTI.prefersVectorizedAddressing())
6888     return;
6889 
6890   // Start with all scalar pointer uses.
6891   SmallPtrSet<Instruction *, 8> AddrDefs;
6892   for (BasicBlock *BB : TheLoop->blocks())
6893     for (Instruction &I : *BB) {
6894       Instruction *PtrDef =
6895         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6896       if (PtrDef && TheLoop->contains(PtrDef) &&
6897           getWideningDecision(&I, VF) != CM_GatherScatter)
6898         AddrDefs.insert(PtrDef);
6899     }
6900 
6901   // Add all instructions used to generate the addresses.
6902   SmallVector<Instruction *, 4> Worklist;
6903   append_range(Worklist, AddrDefs);
6904   while (!Worklist.empty()) {
6905     Instruction *I = Worklist.pop_back_val();
6906     for (auto &Op : I->operands())
6907       if (auto *InstOp = dyn_cast<Instruction>(Op))
6908         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6909             AddrDefs.insert(InstOp).second)
6910           Worklist.push_back(InstOp);
6911   }
6912 
6913   for (auto *I : AddrDefs) {
6914     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
6919       InstWidening Decision = getWideningDecision(I, VF);
6920       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6921         // Scalarize a widened load of address.
6922         setWideningDecision(
6923             I, VF, CM_Scalarize,
6924             (VF.getKnownMinValue() *
6925              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6926       else if (auto Group = getInterleavedAccessGroup(I)) {
6927         // Scalarize an interleave group of address loads.
6928         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6929           if (Instruction *Member = Group->getMember(I))
6930             setWideningDecision(
6931                 Member, VF, CM_Scalarize,
6932                 (VF.getKnownMinValue() *
6933                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6934         }
6935       }
6936     } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
6939       ForcedScalars[VF].insert(I);
6940   }
6941 }
6942 
6943 InstructionCost
6944 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6945                                                Type *&VectorTy) {
6946   Type *RetTy = I->getType();
6947   if (canTruncateToMinimalBitwidth(I, VF))
6948     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6949   auto SE = PSE.getSE();
6950   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6951 
6952   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6953                                                 ElementCount VF) -> bool {
6954     if (VF.isScalar())
6955       return true;
6956 
6957     auto Scalarized = InstsToScalarize.find(VF);
6958     assert(Scalarized != InstsToScalarize.end() &&
6959            "VF not yet analyzed for scalarization profitability");
6960     return !Scalarized->second.count(I) &&
6961            llvm::all_of(I->users(), [&](User *U) {
6962              auto *UI = cast<Instruction>(U);
6963              return !Scalarized->second.count(UI);
6964            });
6965   };
6966   (void) hasSingleCopyAfterVectorization;
6967 
6968   if (isScalarAfterVectorization(I, VF)) {
6969     // With the exception of GEPs and PHIs, after scalarization there should
6970     // only be one copy of the instruction generated in the loop. This is
6971     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
6973     // it means we don't have to multiply the instruction cost by VF.
6974     assert(I->getOpcode() == Instruction::GetElementPtr ||
6975            I->getOpcode() == Instruction::PHI ||
6976            (I->getOpcode() == Instruction::BitCast &&
6977             I->getType()->isPointerTy()) ||
6978            hasSingleCopyAfterVectorization(I, VF));
6979     VectorTy = RetTy;
6980   } else
6981     VectorTy = ToVectorTy(RetTy, VF);
6982 
6983   // TODO: We need to estimate the cost of intrinsic calls.
6984   switch (I->getOpcode()) {
6985   case Instruction::GetElementPtr:
6986     // We mark this instruction as zero-cost because the cost of GEPs in
6987     // vectorized code depends on whether the corresponding memory instruction
6988     // is scalarized or not. Therefore, we handle GEPs with the memory
6989     // instruction cost.
6990     return 0;
6991   case Instruction::Br: {
6992     // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
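    // As a rough, hypothetical example: at VF = 4 the cost charged below for a
    // branch around a scalarized predicated block amounts to four i1 extracts
    // plus four scalar branch instructions.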
6995     bool ScalarPredicatedBB = false;
6996     BranchInst *BI = cast<BranchInst>(I);
6997     if (VF.isVector() && BI->isConditional() &&
6998         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6999          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7000       ScalarPredicatedBB = true;
7001 
7002     if (ScalarPredicatedBB) {
      // Not possible to scalarize a scalable vector with predicated instructions.
7004       if (VF.isScalable())
7005         return InstructionCost::getInvalid();
7006       // Return cost for branches around scalarized and predicated blocks.
7007       auto *Vec_i1Ty =
7008           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7009       return (
7010           TTI.getScalarizationOverhead(
7011               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7012           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7013     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7014       // The back-edge branch will remain, as will all scalar branches.
7015       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7016     else
7017       // This branch will be eliminated by if-conversion.
7018       return 0;
7019     // Note: We currently assume zero cost for an unconditional branch inside
7020     // a predicated block since it will become a fall-through, although we
7021     // may decide in the future to call TTI for all branches.
7022   }
7023   case Instruction::PHI: {
7024     auto *Phi = cast<PHINode>(I);
7025 
7026     // First-order recurrences are replaced by vector shuffles inside the loop.
7027     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7028     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7029       return TTI.getShuffleCost(
7030           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7031           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7032 
7033     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7034     // converted into select instructions. We require N - 1 selects per phi
7035     // node, where N is the number of incoming values.
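    // For example, a phi merging three incoming values is lowered to two
    // vector selects.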
7036     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7037       return (Phi->getNumIncomingValues() - 1) *
7038              TTI.getCmpSelInstrCost(
7039                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7040                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7041                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7042 
7043     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7044   }
7045   case Instruction::UDiv:
7046   case Instruction::SDiv:
7047   case Instruction::URem:
7048   case Instruction::SRem:
7049     // If we have a predicated instruction, it may not be executed for each
7050     // vector lane. Get the scalarization cost and scale this amount by the
7051     // probability of executing the predicated block. If the instruction is not
7052     // predicated, we fall through to the next case.
7053     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7054       InstructionCost Cost = 0;
7055 
7056       // These instructions have a non-void type, so account for the phi nodes
7057       // that we will create. This cost is likely to be zero. The phi node
7058       // cost, if any, should be scaled by the block probability because it
7059       // models a copy at the end of each predicated block.
7060       Cost += VF.getKnownMinValue() *
7061               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7062 
7063       // The cost of the non-predicated instruction.
7064       Cost += VF.getKnownMinValue() *
7065               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7066 
7067       // The cost of insertelement and extractelement instructions needed for
7068       // scalarization.
7069       Cost += getScalarizationOverhead(I, VF);
7070 
7071       // Scale the cost by the probability of executing the predicated blocks.
7072       // This assumes the predicated block for each vector lane is equally
7073       // likely.
7074       return Cost / getReciprocalPredBlockProb();
7075     }
7076     LLVM_FALLTHROUGH;
7077   case Instruction::Add:
7078   case Instruction::FAdd:
7079   case Instruction::Sub:
7080   case Instruction::FSub:
7081   case Instruction::Mul:
7082   case Instruction::FMul:
7083   case Instruction::FDiv:
7084   case Instruction::FRem:
7085   case Instruction::Shl:
7086   case Instruction::LShr:
7087   case Instruction::AShr:
7088   case Instruction::And:
7089   case Instruction::Or:
7090   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
7092     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7093       return 0;
7094 
7095     // Detect reduction patterns
7096     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7097       return *RedCost;
7098 
7099     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7101     Value *Op2 = I->getOperand(1);
7102     TargetTransformInfo::OperandValueProperties Op2VP;
7103     TargetTransformInfo::OperandValueKind Op2VK =
7104         TTI.getOperandInfo(Op2, Op2VP);
7105     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7106       Op2VK = TargetTransformInfo::OK_UniformValue;
7107 
7108     SmallVector<const Value *, 4> Operands(I->operand_values());
7109     return TTI.getArithmeticInstrCost(
7110         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7111         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7112   }
7113   case Instruction::FNeg: {
7114     return TTI.getArithmeticInstrCost(
7115         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7116         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7117         TargetTransformInfo::OP_None, I->getOperand(0), I);
7118   }
7119   case Instruction::Select: {
7120     SelectInst *SI = cast<SelectInst>(I);
7121     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7122     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7123 
7124     const Value *Op0, *Op1;
7125     using namespace llvm::PatternMatch;
7126     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7127                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7128       // select x, y, false --> x & y
7129       // select x, true, y --> x | y
7130       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7131       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7132       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7133       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7134       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7135               Op1->getType()->getScalarSizeInBits() == 1);
7136 
7137       SmallVector<const Value *, 2> Operands{Op0, Op1};
7138       return TTI.getArithmeticInstrCost(
7139           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7140           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7141     }
7142 
7143     Type *CondTy = SI->getCondition()->getType();
7144     if (!ScalarCond)
7145       CondTy = VectorType::get(CondTy, VF);
7146 
7147     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7148     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7149       Pred = Cmp->getPredicate();
7150     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7151                                   CostKind, I);
7152   }
7153   case Instruction::ICmp:
7154   case Instruction::FCmp: {
7155     Type *ValTy = I->getOperand(0)->getType();
7156     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7157     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7158       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7159     VectorTy = ToVectorTy(ValTy, VF);
7160     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7161                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7162                                   I);
7163   }
7164   case Instruction::Store:
7165   case Instruction::Load: {
7166     ElementCount Width = VF;
7167     if (Width.isVector()) {
7168       InstWidening Decision = getWideningDecision(I, Width);
7169       assert(Decision != CM_Unknown &&
7170              "CM decision should be taken at this point");
7171       if (Decision == CM_Scalarize)
7172         Width = ElementCount::getFixed(1);
7173     }
7174     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7175     return getMemoryInstructionCost(I, VF);
7176   }
7177   case Instruction::BitCast:
7178     if (I->getType()->isPointerTy())
7179       return 0;
7180     LLVM_FALLTHROUGH;
7181   case Instruction::ZExt:
7182   case Instruction::SExt:
7183   case Instruction::FPToUI:
7184   case Instruction::FPToSI:
7185   case Instruction::FPExt:
7186   case Instruction::PtrToInt:
7187   case Instruction::IntToPtr:
7188   case Instruction::SIToFP:
7189   case Instruction::UIToFP:
7190   case Instruction::Trunc:
7191   case Instruction::FPTrunc: {
7192     // Computes the CastContextHint from a Load/Store instruction.
7193     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7194       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7195              "Expected a load or a store!");
7196 
7197       if (VF.isScalar() || !TheLoop->contains(I))
7198         return TTI::CastContextHint::Normal;
7199 
7200       switch (getWideningDecision(I, VF)) {
7201       case LoopVectorizationCostModel::CM_GatherScatter:
7202         return TTI::CastContextHint::GatherScatter;
7203       case LoopVectorizationCostModel::CM_Interleave:
7204         return TTI::CastContextHint::Interleave;
7205       case LoopVectorizationCostModel::CM_Scalarize:
7206       case LoopVectorizationCostModel::CM_Widen:
7207         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7208                                         : TTI::CastContextHint::Normal;
7209       case LoopVectorizationCostModel::CM_Widen_Reverse:
7210         return TTI::CastContextHint::Reversed;
7211       case LoopVectorizationCostModel::CM_Unknown:
7212         llvm_unreachable("Instr did not go through cost modelling?");
7213       }
7214 
7215       llvm_unreachable("Unhandled case!");
7216     };
7217 
7218     unsigned Opcode = I->getOpcode();
7219     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7221     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7222       if (I->hasOneUse())
7223         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7224           CCH = ComputeCCH(Store);
7225     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7227     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7228              Opcode == Instruction::FPExt) {
7229       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7230         CCH = ComputeCCH(Load);
7231     }
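    // For illustration of the two cases above: a 'trunc' whose only user is a
    // store takes its cast context from that store's widening decision (e.g.
    // Masked if the store requires a mask), while a 'zext' fed by a load that
    // is widened in reverse is costed with a Reversed context.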
7232 
7233     // We optimize the truncation of induction variables having constant
7234     // integer steps. The cost of these truncations is the same as the scalar
7235     // operation.
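    // For illustration: 'trunc i64 %iv to i32', where %iv is an induction
    // with a constant step, can be generated directly as an i32 induction, so
    // no vector truncate is needed and only the cost of the scalar trunc is
    // paid.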
7236     if (isOptimizableIVTruncate(I, VF)) {
7237       auto *Trunc = cast<TruncInst>(I);
7238       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7239                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7240     }
7241 
7242     // Detect reduction patterns
7243     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7244       return *RedCost;
7245 
7246     Type *SrcScalarTy = I->getOperand(0)->getType();
7247     Type *SrcVecTy =
7248         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7249     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7252       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7253       //
7254       // Calculate the modified src and dest types.
7255       Type *MinVecTy = VectorTy;
7256       if (Opcode == Instruction::Trunc) {
7257         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7258         VectorTy =
7259             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7260       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7261         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7262         VectorTy =
7263             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7264       }
7265     }
7266 
7267     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7268   }
7269   case Instruction::Call: {
7270     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7271       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7272         return *RedCost;
7273     bool NeedToScalarize;
7274     CallInst *CI = cast<CallInst>(I);
7275     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7276     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7277       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7278       return std::min(CallCost, IntrinsicCost);
7279     }
7280     return CallCost;
7281   }
7282   case Instruction::ExtractValue:
7283     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7284   case Instruction::Alloca:
7285     // We cannot easily widen alloca to a scalable alloca, as
7286     // the result would need to be a vector of pointers.
7287     if (VF.isScalable())
7288       return InstructionCost::getInvalid();
7289     LLVM_FALLTHROUGH;
7290   default:
7291     // This opcode is unknown. Assume that it is the same as 'mul'.
7292     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7293   } // end of switch.
7294 }
7295 
7296 char LoopVectorize::ID = 0;
7297 
7298 static const char lv_name[] = "Loop Vectorization";
7299 
7300 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7301 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7302 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7303 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7304 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7305 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7306 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7307 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7308 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7309 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7310 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7311 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7312 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7313 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7314 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7315 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7316 
7317 namespace llvm {
7318 
7319 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7320 
7321 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7322                               bool VectorizeOnlyWhenForced) {
7323   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7324 }
7325 
7326 } // end namespace llvm
7327 
7328 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7329   // Check if the pointer operand of a load or store instruction is
7330   // consecutive.
7331   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7332     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7333   return false;
7334 }
7335 
7336 void LoopVectorizationCostModel::collectValuesToIgnore() {
7337   // Ignore ephemeral values.
7338   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7339 
  // Find all stores to invariant variables. Since they are going to sink
  // outside the loop we do not need to calculate their cost.
7342   for (BasicBlock *BB : TheLoop->blocks())
7343     for (Instruction &I : *BB) {
7344       StoreInst *SI;
7345       if ((SI = dyn_cast<StoreInst>(&I)) &&
7346           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7347         ValuesToIgnore.insert(&I);
7348     }
7349 
7350   // Ignore type-promoting instructions we identified during reduction
7351   // detection.
7352   for (auto &Reduction : Legal->getReductionVars()) {
7353     const RecurrenceDescriptor &RedDes = Reduction.second;
7354     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7355     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7356   }
7357   // Ignore type-casting instructions we identified during induction
7358   // detection.
7359   for (auto &Induction : Legal->getInductionVars()) {
7360     const InductionDescriptor &IndDes = Induction.second;
7361     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7362     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7363   }
7364 }
7365 
7366 void LoopVectorizationCostModel::collectInLoopReductions() {
7367   for (auto &Reduction : Legal->getReductionVars()) {
7368     PHINode *Phi = Reduction.first;
7369     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7370 
7371     // We don't collect reductions that are type promoted (yet).
7372     if (RdxDesc.getRecurrenceType() != Phi->getType())
7373       continue;
7374 
7375     // If the target would prefer this reduction to happen "in-loop", then we
7376     // want to record it as such.
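    // An in-loop reduction keeps a scalar accumulator and reduces each vector
    // of values inside the loop (e.g. via a vector.reduce intrinsic), whereas
    // an out-of-loop reduction keeps a vector accumulator and performs a
    // single reduction after the loop.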
7377     unsigned Opcode = RdxDesc.getOpcode();
7378     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7379         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7380                                    TargetTransformInfo::ReductionFlags()))
7381       continue;
7382 
7383     // Check that we can correctly put the reductions into the loop, by
7384     // finding the chain of operations that leads from the phi to the loop
7385     // exit value.
7386     SmallVector<Instruction *, 4> ReductionOperations =
7387         RdxDesc.getReductionOpChain(Phi, TheLoop);
7388     bool InLoop = !ReductionOperations.empty();
7389     if (InLoop) {
7390       InLoopReductionChains[Phi] = ReductionOperations;
7391       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7392       Instruction *LastChain = Phi;
7393       for (auto *I : ReductionOperations) {
7394         InLoopReductionImmediateChains[I] = LastChain;
7395         LastChain = I;
7396       }
7397     }
7398     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7399                       << " reduction for phi: " << *Phi << "\n");
7400   }
7401 }
7402 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do this yet because VPlan does not have a
// cost model that can choose which plan to execute when more than one is
// generated.
7408 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7409                                  LoopVectorizationCostModel &CM) {
7410   unsigned WidestType;
7411   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7412   return WidestVectorRegBits / WidestType;
7413 }
7414 
7415 VectorizationFactor
7416 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7417   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7418   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
7423   if (!OrigLoop->isInnermost()) {
7424     // If the user doesn't provide a vectorization factor, determine a
7425     // reasonable one.
7426     if (UserVF.isZero()) {
7427       VF = ElementCount::getFixed(determineVPlanVF(
7428           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7429               .getFixedSize(),
7430           CM));
7431       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7432 
7433       // Make sure we have a VF > 1 for stress testing.
7434       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7435         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7436                           << "overriding computed VF.\n");
7437         VF = ElementCount::getFixed(4);
7438       }
7439     }
7440     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7441     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7442            "VF needs to be a power of two");
7443     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7444                       << "VF " << VF << " to build VPlans.\n");
7445     buildVPlans(VF, VF);
7446 
7447     // For VPlan build stress testing, we bail out after VPlan construction.
7448     if (VPlanBuildStressTest)
7449       return VectorizationFactor::Disabled();
7450 
7451     return {VF, 0 /*Cost*/};
7452   }
7453 
7454   LLVM_DEBUG(
7455       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7456                 "VPlan-native path.\n");
7457   return VectorizationFactor::Disabled();
7458 }
7459 
7460 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
7461   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7462   return (NumRuntimePointerChecks >
7463               VectorizerParams::RuntimeMemoryCheckThreshold &&
7464           !Hints.allowReordering()) ||
7465          NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7466 }
7467 
7468 Optional<VectorizationFactor>
7469 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7470   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7471   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7473     return None;
7474 
7475   // Invalidate interleave groups if all blocks of loop will be predicated.
7476   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7477       !useMaskedInterleavedAccesses(*TTI)) {
7478     LLVM_DEBUG(
7479         dbgs()
7480         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7481            "which requires masked-interleaved support.\n");
7482     if (CM.InterleaveInfo.invalidateGroups())
7483       // Invalidating interleave groups also requires invalidating all decisions
7484       // based on them, which includes widening decisions and uniform and scalar
7485       // values.
7486       CM.invalidateCostModelingDecisions();
7487   }
7488 
7489   ElementCount MaxUserVF =
7490       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7491   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7492   if (!UserVF.isZero() && UserVFIsLegal) {
7493     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7494            "VF needs to be a power of two");
7495     // Collect the instructions (and their associated costs) that will be more
7496     // profitable to scalarize.
7497     if (CM.selectUserVectorizationFactor(UserVF)) {
7498       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7499       CM.collectInLoopReductions();
7500       buildVPlansWithVPRecipes(UserVF, UserVF);
7501       LLVM_DEBUG(printPlans(dbgs()));
7502       return {{UserVF, 0}};
7503     } else
7504       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7505                               "InvalidCost", ORE, OrigLoop);
7506   }
7507 
7508   // Populate the set of Vectorization Factor Candidates.
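  // For example, with MaxFactors = {16, vscale x 4} the candidates are
  // {1, 2, 4, 8, 16} and {vscale x 1, vscale x 2, vscale x 4}.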
7509   ElementCountSet VFCandidates;
7510   for (auto VF = ElementCount::getFixed(1);
7511        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7512     VFCandidates.insert(VF);
7513   for (auto VF = ElementCount::getScalable(1);
7514        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7515     VFCandidates.insert(VF);
7516 
7517   for (const auto &VF : VFCandidates) {
7518     // Collect Uniform and Scalar instructions after vectorization with VF.
7519     CM.collectUniformsAndScalars(VF);
7520 
7521     // Collect the instructions (and their associated costs) that will be more
7522     // profitable to scalarize.
7523     if (VF.isVector())
7524       CM.collectInstsToScalarize(VF);
7525   }
7526 
7527   CM.collectInLoopReductions();
7528   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7529   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7530 
7531   LLVM_DEBUG(printPlans(dbgs()));
7532   if (!MaxFactors.hasVector())
7533     return VectorizationFactor::Disabled();
7534 
7535   // Select the optimal vectorization factor.
7536   return CM.selectVectorizationFactor(VFCandidates);
7537 }
7538 
7539 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7540   assert(count_if(VPlans,
7541                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7542              1 &&
         "Best VF does not have exactly one VPlan.");
7544 
7545   for (const VPlanPtr &Plan : VPlans) {
7546     if (Plan->hasVF(VF))
7547       return *Plan.get();
7548   }
7549   llvm_unreachable("No plan found!");
7550 }
7551 
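/// Add "llvm.loop.unroll.runtime.disable" to the loop ID metadata of \p L,
/// unless unroll-disabling metadata is already present. For illustration, the
/// resulting loop ID has the form:
///   !0 = distinct !{!0, ..., !1}
///   !1 = !{!"llvm.loop.unroll.runtime.disable"}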
7552 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7553   SmallVector<Metadata *, 4> MDs;
7554   // Reserve first location for self reference to the LoopID metadata node.
7555   MDs.push_back(nullptr);
7556   bool IsUnrollMetadata = false;
7557   MDNode *LoopID = L->getLoopID();
7558   if (LoopID) {
7559     // First find existing loop unrolling disable metadata.
7560     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7561       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7562       if (MD) {
7563         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata |=
            S && S->getString().startswith("llvm.loop.unroll.disable");
7566       }
7567       MDs.push_back(LoopID->getOperand(i));
7568     }
7569   }
7570 
7571   if (!IsUnrollMetadata) {
7572     // Add runtime unroll disable metadata.
7573     LLVMContext &Context = L->getHeader()->getContext();
7574     SmallVector<Metadata *, 1> DisableOperands;
7575     DisableOperands.push_back(
7576         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7577     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7578     MDs.push_back(DisableNode);
7579     MDNode *NewLoopID = MDNode::get(Context, MDs);
7580     // Set operand 0 to refer to the loop id itself.
7581     NewLoopID->replaceOperandWith(0, NewLoopID);
7582     L->setLoopID(NewLoopID);
7583   }
7584 }
7585 
7586 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7587                                            VPlan &BestVPlan,
7588                                            InnerLoopVectorizer &ILV,
7589                                            DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');
7592 
7593   // Perform the actual loop transformation.
7594 
7595   // 1. Set up the skeleton for vectorization, including vector pre-header and
7596   // middle block. The vector loop is created during VPlan execution.
7597   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7598   Value *CanonicalIVStartValue;
7599   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7600       ILV.createVectorizedLoopSkeleton();
7601   ILV.collectPoisonGeneratingRecipes(State);
7602 
7603   ILV.printDebugTracesAtStart();
7604 
7605   //===------------------------------------------------===//
7606   //
7607   // Notice: any optimization or new instruction that go
7608   // into the code below should also be implemented in
7609   // the cost-model.
7610   //
7611   //===------------------------------------------------===//
7612 
7613   // 2. Copy and widen instructions from the old loop into the new loop.
7614   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7615                              ILV.getOrCreateVectorTripCount(nullptr),
7616                              CanonicalIVStartValue, State);
7617   BestVPlan.execute(&State);
7618 
7619   // Keep all loop hints from the original loop on the vector loop (we'll
7620   // replace the vectorizer-specific hints below).
7621   MDNode *OrigLoopID = OrigLoop->getLoopID();
7622 
7623   Optional<MDNode *> VectorizedLoopID =
7624       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7625                                       LLVMLoopVectorizeFollowupVectorized});
7626 
7627   VPBasicBlock *HeaderVPBB =
7628       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7629   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7630   if (VectorizedLoopID.hasValue())
7631     L->setLoopID(VectorizedLoopID.getValue());
7632   else {
7633     // Keep all loop hints from the original loop on the vector loop (we'll
7634     // replace the vectorizer-specific hints below).
7635     if (MDNode *LID = OrigLoop->getLoopID())
7636       L->setLoopID(LID);
7637 
7638     LoopVectorizeHints Hints(L, true, *ORE);
7639     Hints.setAlreadyVectorized();
7640   }
7641   // Disable runtime unrolling when vectorizing the epilogue loop.
7642   if (CanonicalIVStartValue)
7643     AddRuntimeUnrollDisableMetaData(L);
7644 
7645   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7646   //    predication, updating analyses.
7647   ILV.fixVectorizedLoop(State, BestVPlan);
7648 
7649   ILV.printDebugTracesAtEnd();
7650 }
7651 
7652 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7653 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7654   for (const auto &Plan : VPlans)
7655     if (PrintVPlansInDotFormat)
7656       Plan->printDOT(O);
7657     else
7658       Plan->print(O);
7659 }
7660 #endif
7661 
7662 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7663     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7664 
  // We create new control-flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
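  // Typically this is the 'icmp' that feeds only the latch's conditional
  // branch.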
7668   SmallVector<BasicBlock*> ExitingBlocks;
7669   OrigLoop->getExitingBlocks(ExitingBlocks);
7670   for (auto *BB : ExitingBlocks) {
7671     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7672     if (!Cmp || !Cmp->hasOneUse())
7673       continue;
7674 
7675     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7676     if (!DeadInstructions.insert(Cmp).second)
7677       continue;
7678 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
7680     // TODO: can recurse through operands in general
7681     for (Value *Op : Cmp->operands()) {
7682       if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7684     }
7685   }
7686 
7687   // We create new "steps" for induction variable updates to which the original
7688   // induction variables map. An original update instruction will be dead if
7689   // all its users except the induction variable are dead.
7690   auto *Latch = OrigLoop->getLoopLatch();
7691   for (auto &Induction : Legal->getInductionVars()) {
7692     PHINode *Ind = Induction.first;
7693     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7694 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7697     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7698       continue;
7699 
7700     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7701           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7702         }))
7703       DeadInstructions.insert(IndUpdate);
7704   }
7705 }
7706 
7707 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7708 
7709 //===--------------------------------------------------------------------===//
7710 // EpilogueVectorizerMainLoop
7711 //===--------------------------------------------------------------------===//
7712 
7713 /// This function is partially responsible for generating the control flow
7714 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7715 std::pair<BasicBlock *, Value *>
7716 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7717   MDNode *OrigLoopID = OrigLoop->getLoopID();
7718 
7719   // Workaround!  Compute the trip count of the original loop and cache it
7720   // before we start modifying the CFG.  This code has a systemic problem
7721   // wherein it tries to run analysis over partially constructed IR; this is
7722   // wrong, and not simply for SCEV.  The trip count of the original loop
7723   // simply happens to be prone to hitting this in practice.  In theory, we
7724   // can hit the same issue for any SCEV, or ValueTracking query done during
7725   // mutation.  See PR49900.
7726   getOrCreateTripCount(OrigLoop->getLoopPreheader());
7727   createVectorLoopSkeleton("");
7728 
7729   // Generate the code to check the minimum iteration count of the vector
7730   // epilogue (see below).
7731   EPI.EpilogueIterationCountCheck =
7732       emitIterationCountCheck(LoopScalarPreHeader, true);
7733   EPI.EpilogueIterationCountCheck->setName("iter.check");
7734 
7735   // Generate the code to check any assumptions that we've made for SCEV
7736   // expressions.
7737   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7738 
7739   // Generate the code that checks at runtime if arrays overlap. We put the
7740   // checks into a separate block to make the more common case of few elements
7741   // faster.
7742   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7743 
7744   // Generate the iteration count check for the main loop, *after* the check
7745   // for the epilogue loop, so that the path-length is shorter for the case
7746   // that goes directly through the vector epilogue. The longer-path length for
7747   // the main loop is compensated for, by the gain from vectorizing the larger
7748   // trip count. Note: the branch will get updated later on when we vectorize
7749   // the epilogue.
7750   EPI.MainLoopIterationCountCheck =
7751       emitIterationCountCheck(LoopScalarPreHeader, false);
7752 
7753   // Generate the induction variable.
7754   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7755 
  // Skip induction resume value creation here; the resume values will be
  // created in the second pass. If we created them here, they wouldn't be used
  // anyway, because the VPlan in the second pass still contains the inductions
  // from the original loop.
7760 
7761   return {completeLoopSkeleton(OrigLoopID), nullptr};
7762 }
7763 
7764 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7765   LLVM_DEBUG({
7766     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7767            << "Main Loop VF:" << EPI.MainLoopVF
7768            << ", Main Loop UF:" << EPI.MainLoopUF
7769            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7770            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7771   });
7772 }
7773 
7774 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7775   DEBUG_WITH_TYPE(VerboseDebug, {
7776     dbgs() << "intermediate fn:\n"
7777            << *OrigLoop->getHeader()->getParent() << "\n";
7778   });
7779 }
7780 
7781 BasicBlock *
7782 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7783                                                     bool ForEpilogue) {
7784   assert(Bypass && "Expected valid bypass basic block.");
7785   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7786   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7787   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7788   // Reuse existing vector loop preheader for TC checks.
7789   // Note that new preheader block is generated for vector loop.
7790   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7791   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7792 
7793   // Generate code to check if the loop's trip count is less than VF * UF of the
7794   // main vector loop.
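  // When a scalar epilogue is required, use ULE so that the vector loop is
  // only entered when at least one iteration is left over for the scalar
  // remainder loop.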
  auto P = Cost->requiresScalarEpilogue(VFactor) ? ICmpInst::ICMP_ULE
                                                 : ICmpInst::ICMP_ULT;
7797 
7798   Value *CheckMinIters = Builder.CreateICmp(
7799       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7800       "min.iters.check");
7801 
7802   if (!ForEpilogue)
7803     TCCheckBlock->setName("vector.main.loop.iter.check");
7804 
7805   // Create new preheader for vector loop.
7806   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7807                                    DT, LI, nullptr, "vector.ph");
7808 
7809   if (ForEpilogue) {
7810     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7811                                  DT->getNode(Bypass)->getIDom()) &&
7812            "TC check is expected to dominate Bypass");
7813 
7814     // Update dominator for Bypass & LoopExit.
7815     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7816     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7817       // For loops with multiple exits, there's no edge from the middle block
7818       // to exit blocks (as the epilogue must run) and thus no need to update
7819       // the immediate dominator of the exit blocks.
7820       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7821 
7822     LoopBypassBlocks.push_back(TCCheckBlock);
7823 
7824     // Save the trip count so we don't have to regenerate it in the
7825     // vec.epilog.iter.check. This is safe to do because the trip count
7826     // generated here dominates the vector epilog iter check.
7827     EPI.TripCount = Count;
7828   }
7829 
7830   ReplaceInstWithInst(
7831       TCCheckBlock->getTerminator(),
7832       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7833 
7834   return TCCheckBlock;
7835 }
7836 
7837 //===--------------------------------------------------------------------===//
7838 // EpilogueVectorizerEpilogueLoop
7839 //===--------------------------------------------------------------------===//
7840 
7841 /// This function is partially responsible for generating the control flow
7842 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7843 std::pair<BasicBlock *, Value *>
7844 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7845   MDNode *OrigLoopID = OrigLoop->getLoopID();
7846   createVectorLoopSkeleton("vec.epilog.");
7847 
7848   // Now, compare the remaining count and if there aren't enough iterations to
7849   // execute the vectorized epilogue skip to the scalar part.
7850   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7851   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7852   LoopVectorPreHeader =
7853       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7854                  LI, nullptr, "vec.epilog.ph");
7855   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7856                                           VecEpilogueIterationCountCheck);
7857 
7858   // Adjust the control flow taking the state info from the main loop
7859   // vectorization into account.
7860   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7861          "expected this to be saved from the previous pass.");
7862   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7863       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7864 
7865   DT->changeImmediateDominator(LoopVectorPreHeader,
7866                                EPI.MainLoopIterationCountCheck);
7867 
7868   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7869       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7870 
7871   if (EPI.SCEVSafetyCheck)
7872     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7873         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7874   if (EPI.MemSafetyCheck)
7875     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7876         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7877 
7878   DT->changeImmediateDominator(
7879       VecEpilogueIterationCountCheck,
7880       VecEpilogueIterationCountCheck->getSinglePredecessor());
7881 
7882   DT->changeImmediateDominator(LoopScalarPreHeader,
7883                                EPI.EpilogueIterationCountCheck);
7884   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7885     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
7887     // dominator of the exit blocks.
7888     DT->changeImmediateDominator(LoopExitBlock,
7889                                  EPI.EpilogueIterationCountCheck);
7890 
7891   // Keep track of bypass blocks, as they feed start values to the induction
7892   // phis in the scalar loop preheader.
7893   if (EPI.SCEVSafetyCheck)
7894     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7895   if (EPI.MemSafetyCheck)
7896     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7897   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7898 
7899   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
7900   // merge control-flow from the latch block and the middle block. Update the
7901   // incoming values here and move the Phi into the preheader.
7902   SmallVector<PHINode *, 4> PhisInBlock;
7903   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7904     PhisInBlock.push_back(&Phi);
7905 
7906   for (PHINode *Phi : PhisInBlock) {
7907     Phi->replaceIncomingBlockWith(
7908         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7909         VecEpilogueIterationCountCheck);
7910     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7911     if (EPI.SCEVSafetyCheck)
7912       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7913     if (EPI.MemSafetyCheck)
7914       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7915     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7916   }
7917 
  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7920   Type *IdxTy = Legal->getWidestInductionType();
7921   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7922                                          LoopVectorPreHeader->getFirstNonPHI());
7923   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7924   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7925                            EPI.MainLoopIterationCountCheck);
7926 
7927   // Generate induction resume values. These variables save the new starting
7928   // indexes for the scalar loop. They are used to test if there are any tail
7929   // iterations left once the vector loop has completed.
7930   // Note that when the vectorized epilogue is skipped due to iteration count
7931   // check, then the resume value for the induction variable comes from
7932   // the trip count of the main vector loop, hence passing the AdditionalBypass
7933   // argument.
7934   createInductionResumeValues({VecEpilogueIterationCountCheck,
7935                                EPI.VectorTripCount} /* AdditionalBypass */);
7936 
7937   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7938 }
7939 
7940 BasicBlock *
7941 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7942     BasicBlock *Bypass, BasicBlock *Insert) {
7943 
7944   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
7946   assert(
7947       (!isa<Instruction>(EPI.TripCount) ||
7948        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7949       "saved trip count does not dominate insertion point.");
7950   Value *TC = EPI.TripCount;
7951   IRBuilder<> Builder(Insert->getTerminator());
7952   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7953 
7954   // Generate code to check if the loop's trip count is less than VF * UF of the
7955   // vector epilogue loop.
7956   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7957       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7958 
7959   Value *CheckMinIters =
7960       Builder.CreateICmp(P, Count,
7961                          createStepForVF(Builder, Count->getType(),
7962                                          EPI.EpilogueVF, EPI.EpilogueUF),
7963                          "min.epilog.iters.check");
7964 
7965   ReplaceInstWithInst(
7966       Insert->getTerminator(),
7967       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7968 
7969   LoopBypassBlocks.push_back(Insert);
7970   return Insert;
7971 }
7972 
7973 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7974   LLVM_DEBUG({
7975     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7976            << "Epilogue Loop VF:" << EPI.EpilogueVF
7977            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7978   });
7979 }
7980 
7981 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7982   DEBUG_WITH_TYPE(VerboseDebug, {
7983     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7984   });
7985 }
7986 
7987 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7988     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7989   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7990   bool PredicateAtRangeStart = Predicate(Range.Start);
7991 
7992   for (ElementCount TmpVF = Range.Start * 2;
7993        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7994     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7995       Range.End = TmpVF;
7996       break;
7997     }
7998 
7999   return PredicateAtRangeStart;
8000 }
8001 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
8003 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8004 /// of VF's starting at a given VF and extending it as much as possible. Each
8005 /// vectorization decision can potentially shorten this sub-range during
8006 /// buildVPlan().
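/// For example, if the decisions agree for {MinVF, 2 * MinVF} but change at
/// 4 * MinVF, the first VPlan covers the sub-range [MinVF, 4 * MinVF) and the
/// next sub-range starts at 4 * MinVF.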
8007 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8008                                            ElementCount MaxVF) {
8009   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8010   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8011     VFRange SubRange = {VF, MaxVFPlusOne};
8012     VPlans.push_back(buildVPlan(SubRange));
8013     VF = SubRange.End;
8014   }
8015 }
8016 
8017 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8018                                          VPlanPtr &Plan) {
8019   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8020 
8021   // Look for cached value.
8022   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8023   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8024   if (ECEntryIt != EdgeMaskCache.end())
8025     return ECEntryIt->second;
8026 
8027   VPValue *SrcMask = createBlockInMask(Src, Plan);
8028 
8029   // The terminator has to be a branch inst!
8030   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8031   assert(BI && "Unexpected terminator found");
8032 
8033   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8034     return EdgeMaskCache[Edge] = SrcMask;
8035 
8036   // If source is an exiting block, we know the exit edge is dynamically dead
8037   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8038   // adding uses of an otherwise potentially dead instruction.
8039   if (OrigLoop->isLoopExiting(Src))
8040     return EdgeMaskCache[Edge] = SrcMask;
8041 
8042   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8043   assert(EdgeMask && "No Edge Mask found for condition");
8044 
8045   if (BI->getSuccessor(0) != Dst)
8046     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8047 
8048   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8049     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8050     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8051     // The select version does not introduce new UB if SrcMask is false and
8052     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8053     VPValue *False = Plan->getOrAddVPValue(
8054         ConstantInt::getFalse(BI->getCondition()->getType()));
8055     EdgeMask =
8056         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8057   }
8058 
8059   return EdgeMaskCache[Edge] = EdgeMask;
8060 }
8061 
8062 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8063   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8064 
8065   // Look for cached value.
8066   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8067   if (BCEntryIt != BlockMaskCache.end())
8068     return BCEntryIt->second;
8069 
8070   // All-one mask is modelled as no-mask following the convention for masked
8071   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8072   VPValue *BlockMask = nullptr;
8073 
8074   if (OrigLoop->getHeader() == BB) {
8075     if (!CM.blockNeedsPredicationForAnyReason(BB))
8076       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8077 
8078     // Introduce the early-exit compare IV <= BTC to form header block mask.
8079     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8080     // constructing the desired canonical IV in the header block as its first
8081     // non-phi instructions.
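    // For illustration, with a trip count of 10 and VF=4 the backedge-taken
    // count is 9; the third vector iteration compares <8,9,10,11> ule 9 and
    // produces the mask <1,1,0,0>, disabling the two tail lanes.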
8082     assert(CM.foldTailByMasking() && "must fold the tail");
8083     VPBasicBlock *HeaderVPBB =
8084         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8085     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8086     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8087     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8088 
8089     VPBuilder::InsertPointGuard Guard(Builder);
8090     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8091     if (CM.TTI.emitGetActiveLaneMask()) {
8092       VPValue *TC = Plan->getOrCreateTripCount();
8093       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8094     } else {
8095       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8096       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8097     }
8098     return BlockMaskCache[BB] = BlockMask;
8099   }
8100 
8101   // This is the block mask. We OR all incoming edges.
8102   for (auto *Predecessor : predecessors(BB)) {
8103     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8104     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8105       return BlockMaskCache[BB] = EdgeMask;
8106 
8107     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8108       BlockMask = EdgeMask;
8109       continue;
8110     }
8111 
8112     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8113   }
8114 
8115   return BlockMaskCache[BB] = BlockMask;
8116 }
8117 
8118 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8119                                                 ArrayRef<VPValue *> Operands,
8120                                                 VFRange &Range,
8121                                                 VPlanPtr &Plan) {
8122   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8123          "Must be called with either a load or store");
8124 
8125   auto willWiden = [&](ElementCount VF) -> bool {
8126     LoopVectorizationCostModel::InstWidening Decision =
8127         CM.getWideningDecision(I, VF);
8128     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8129            "CM decision should be taken at this point.");
8130     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8131       return true;
8132     if (CM.isScalarAfterVectorization(I, VF) ||
8133         CM.isProfitableToScalarize(I, VF))
8134       return false;
8135     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8136   };
8137 
8138   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8139     return nullptr;
8140 
8141   VPValue *Mask = nullptr;
8142   if (Legal->isMaskRequired(I))
8143     Mask = createBlockInMask(I->getParent(), Plan);
8144 
8145   // Determine if the pointer operand of the access is either consecutive or
8146   // reverse consecutive.
8147   LoopVectorizationCostModel::InstWidening Decision =
8148       CM.getWideningDecision(I, Range.Start);
8149   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8150   bool Consecutive =
8151       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8152 
8153   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8154     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8155                                               Consecutive, Reverse);
8156 
8157   StoreInst *Store = cast<StoreInst>(I);
8158   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8159                                             Mask, Consecutive, Reverse);
8160 }
8161 
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8163 /// insert a recipe to expand the step for the induction recipe.
8164 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8165     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8166     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8167     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8168   // Returns true if an instruction \p I should be scalarized instead of
8169   // vectorized for the chosen vectorization factor.
8170   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8171     return CM.isScalarAfterVectorization(I, VF) ||
8172            CM.isProfitableToScalarize(I, VF);
8173   };
8174 
8175   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8176       [&](ElementCount VF) {
8177         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8178       },
8179       Range);
8180   assert(IndDesc.getStartValue() ==
8181          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8182   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8183          "step must be loop invariant");
8184 
8185   VPValue *Step =
8186       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8187   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8188     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8189                                              !NeedsScalarIVOnly);
8190   }
8191   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8192   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8193                                            !NeedsScalarIVOnly);
8194 }
8195 
8196 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8197     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8198 
8199   // Check if this is an integer or fp induction. If so, build the recipe that
8200   // produces its scalar and vector values.
8201   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8202     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8203                                        *PSE.getSE(), *OrigLoop, Range);
8204 
8205   // Check if this is pointer induction. If so, build the recipe for it.
8206   if (auto *II = Legal->getPointerInductionDescriptor(Phi))
8207     return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
8208                                              *PSE.getSE());
8209   return nullptr;
8210 }
8211 
8212 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8213     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8214   // Optimize the special case where the source is a constant integer
8215   // induction variable. Notice that we can only optimize the 'trunc' case
8216   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8217   // (c) other casts depend on pointer size.
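  // For illustration: '%t = trunc i64 %iv to i32', where %iv has a constant
  // step, is folded into the induction recipe, which then produces i32
  // vectors directly from the truncated start and step, so no vector trunc is
  // emitted.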
8218 
8219   // Determine whether \p K is a truncation based on an induction variable that
8220   // can be optimized.
8221   auto isOptimizableIVTruncate =
8222       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8223     return [=](ElementCount VF) -> bool {
8224       return CM.isOptimizableIVTruncate(K, VF);
8225     };
8226   };
8227 
8228   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8229           isOptimizableIVTruncate(I), Range)) {
8230 
8231     auto *Phi = cast<PHINode>(I->getOperand(0));
8232     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8233     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8234     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8235                                        *PSE.getSE(), *OrigLoop, Range);
8236   }
8237   return nullptr;
8238 }
8239 
8240 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8241                                                 ArrayRef<VPValue *> Operands,
8242                                                 VPlanPtr &Plan) {
8243   // If all incoming values are equal, the incoming VPValue can be used directly
8244   // instead of creating a new VPBlendRecipe.
8245   VPValue *FirstIncoming = Operands[0];
8246   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8247         return FirstIncoming == Inc;
8248       })) {
8249     return Operands[0];
8250   }
8251 
8252   unsigned NumIncoming = Phi->getNumIncomingValues();
8253   // For in-loop reductions, we do not need to create an additional select.
8254   VPValue *InLoopVal = nullptr;
8255   for (unsigned In = 0; In < NumIncoming; In++) {
8256     PHINode *PhiOp =
8257         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8258     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8259       assert(!InLoopVal && "Found more than one in-loop reduction!");
8260       InLoopVal = Operands[In];
8261     }
8262   }
8263 
8264   assert((!InLoopVal || NumIncoming == 2) &&
8265          "Found an in-loop reduction for PHI with unexpected number of "
8266          "incoming values");
8267   if (InLoopVal)
8268     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8269 
8270   // We know that all PHIs in non-header blocks are converted into selects, so
8271   // we don't have to worry about the insertion order and we can just use the
8272   // builder. At this point we generate the predication tree. There may be
8273   // duplications since this is a simple recursive scan, but future
8274   // optimizations will clean it up.
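  // For illustration, 'phi [ %a, %bb1 ], [ %b, %bb2 ]' with edge masks %m1 and
  // %m2 becomes a blend of (%a, %m1) and (%b, %m2), which is later lowered to
  // roughly 'select %m2, %b, %a'.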
8275   SmallVector<VPValue *, 2> OperandsWithMask;
8276 
8277   for (unsigned In = 0; In < NumIncoming; In++) {
8278     VPValue *EdgeMask =
8279       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8280     assert((EdgeMask || NumIncoming == 1) &&
8281            "Multiple predecessors with one having a full mask");
8282     OperandsWithMask.push_back(Operands[In]);
8283     if (EdgeMask)
8284       OperandsWithMask.push_back(EdgeMask);
8285   }
8286   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8287 }
8288 
8289 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8290                                                    ArrayRef<VPValue *> Operands,
8291                                                    VFRange &Range) const {
8292 
8293   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8294       [this, CI](ElementCount VF) {
8295         return CM.isScalarWithPredication(CI, VF);
8296       },
8297       Range);
8298 
8299   if (IsPredicated)
8300     return nullptr;
8301 
8302   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8303   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8304              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8305              ID == Intrinsic::pseudoprobe ||
8306              ID == Intrinsic::experimental_noalias_scope_decl))
8307     return nullptr;
8308 
8309   auto willWiden = [&](ElementCount VF) -> bool {
8310     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag indicates whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction, i.e. whether it is beneficial to
    // perform the intrinsic call rather than a library call.
8315     bool NeedToScalarize = false;
8316     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8317     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8318     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8319     return UseVectorIntrinsic || !NeedToScalarize;
8320   };
8321 
8322   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8323     return nullptr;
8324 
8325   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8326   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8327 }
8328 
8329 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8330   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8331          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8334   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8335     return CM.isScalarAfterVectorization(I, VF) ||
8336            CM.isProfitableToScalarize(I, VF) ||
8337            CM.isScalarWithPredication(I, VF);
8338   };
8339   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8340                                                              Range);
8341 }
8342 
8343 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8344                                            ArrayRef<VPValue *> Operands) const {
8345   auto IsVectorizableOpcode = [](unsigned Opcode) {
8346     switch (Opcode) {
8347     case Instruction::Add:
8348     case Instruction::And:
8349     case Instruction::AShr:
8350     case Instruction::BitCast:
8351     case Instruction::FAdd:
8352     case Instruction::FCmp:
8353     case Instruction::FDiv:
8354     case Instruction::FMul:
8355     case Instruction::FNeg:
8356     case Instruction::FPExt:
8357     case Instruction::FPToSI:
8358     case Instruction::FPToUI:
8359     case Instruction::FPTrunc:
8360     case Instruction::FRem:
8361     case Instruction::FSub:
8362     case Instruction::ICmp:
8363     case Instruction::IntToPtr:
8364     case Instruction::LShr:
8365     case Instruction::Mul:
8366     case Instruction::Or:
8367     case Instruction::PtrToInt:
8368     case Instruction::SDiv:
8369     case Instruction::Select:
8370     case Instruction::SExt:
8371     case Instruction::Shl:
8372     case Instruction::SIToFP:
8373     case Instruction::SRem:
8374     case Instruction::Sub:
8375     case Instruction::Trunc:
8376     case Instruction::UDiv:
8377     case Instruction::UIToFP:
8378     case Instruction::URem:
8379     case Instruction::Xor:
8380     case Instruction::ZExt:
8381     case Instruction::Freeze:
8382       return true;
8383     }
8384     return false;
8385   };
8386 
8387   if (!IsVectorizableOpcode(I->getOpcode()))
8388     return nullptr;
8389 
8390   // Success: widen this instruction.
8391   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8392 }
8393 
8394 void VPRecipeBuilder::fixHeaderPhis() {
8395   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8396   for (VPHeaderPHIRecipe *R : PhisToFix) {
8397     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8398     VPRecipeBase *IncR =
8399         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8400     R->addOperand(IncR->getVPSingleValue());
8401   }
8402 }
8403 
8404 VPBasicBlock *VPRecipeBuilder::handleReplication(
8405     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8406     VPlanPtr &Plan) {
8407   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8408       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8409       Range);
8410 
8411   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8412       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8413       Range);
8414 
8415   // Even if the instruction is not marked as uniform, there are certain
8416   // intrinsic calls that can be effectively treated as such, so we check for
8417   // them here. Conservatively, we only do this for scalable vectors, since
8418   // for fixed-width VFs we can always fall back on full scalarization.
8419   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8420     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8421     case Intrinsic::assume:
8422     case Intrinsic::lifetime_start:
8423     case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark the call as uniform, which generates one instruction for
8426       // the first lane of the vector. We can't scalarize the call in the same
8427       // way as for fixed-width vectors because we don't know how many lanes
8428       // there are.
8429       //
8430       // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
8433       //      example, the input may be a splat across all lanes.
8434       //   2. For the lifetime start/end intrinsics the pointer operand only
8435       //      does anything useful when the input comes from a stack object,
8436       //      which suggests it should always be uniform. For non-stack objects
8437       //      the effect is to poison the object, which still allows us to
8438       //      remove the call.
8439       IsUniform = true;
8440       break;
8441     default:
8442       break;
8443     }
8444   }
8445 
8446   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8447                                        IsUniform, IsPredicated);
8448   setRecipe(I, Recipe);
8449   Plan->addVPValue(I, Recipe);
8450 
8451   // Find if I uses a predicated instruction. If so, it will use its scalar
8452   // value. Avoid hoisting the insert-element which packs the scalar value into
8453   // a vector value, as that happens iff all users use the vector value.
8454   for (VPValue *Op : Recipe->operands()) {
8455     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8456     if (!PredR)
8457       continue;
8458     auto *RepR =
8459         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8460     assert(RepR->isPredicated() &&
8461            "expected Replicate recipe to be predicated");
8462     RepR->setAlsoPack(false);
8463   }
8464 
8465   // Finalize the recipe for Instr, first if it is not predicated.
8466   if (!IsPredicated) {
8467     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8468     VPBB->appendRecipe(Recipe);
8469     return VPBB;
8470   }
8471   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8472 
8473   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8474   assert(SingleSucc && "VPBB must have a single successor when handling "
8475                        "predicated replication.");
8476   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8477   // Record predicated instructions for above packing optimizations.
8478   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8479   VPBlockUtils::insertBlockAfter(Region, VPBB);
8480   auto *RegSucc = new VPBasicBlock();
8481   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8482   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8483   return RegSucc;
8484 }
8485 
8486 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8487                                                       VPRecipeBase *PredRecipe,
8488                                                       VPlanPtr &Plan) {
8489   // Instructions marked for predication are replicated and placed under an
8490   // if-then construct to prevent side-effects.
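  // For example, for a predicated store the resulting triangle is roughly:
  //   pred.store.entry:    BRANCH-ON-MASK (block-in mask)
  //   pred.store.if:       the replicated store (PredRecipe)
  //   pred.store.continue: optional PRED-PHI merging the scalarized result
  // with edges entry->if, entry->continue and if->continue.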
8491 
8492   // Generate recipes to compute the block mask for this region.
8493   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8494 
8495   // Build the triangular if-then region.
8496   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8497   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8498   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8499   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8500   auto *PHIRecipe = Instr->getType()->isVoidTy()
8501                         ? nullptr
8502                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8503   if (PHIRecipe) {
8504     Plan->removeVPValueFor(Instr);
8505     Plan->addVPValue(Instr, PHIRecipe);
8506   }
8507   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8508   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8509   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8510 
8511   // Note: first set Entry as region entry and then connect successors starting
8512   // from it in order, to propagate the "parent" of each VPBasicBlock.
8513   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8514   VPBlockUtils::connectBlocks(Pred, Exiting);
8515 
8516   return Region;
8517 }
8518 
8519 VPRecipeOrVPValueTy
8520 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8521                                         ArrayRef<VPValue *> Operands,
8522                                         VFRange &Range, VPlanPtr &Plan) {
8523   // First, check for specific widening recipes that deal with inductions, Phi
8524   // nodes, calls and memory operations.
8525   VPRecipeBase *Recipe;
8526   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8527     if (Phi->getParent() != OrigLoop->getHeader())
8528       return tryToBlend(Phi, Operands, Plan);
8529     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8530       return toVPRecipeResult(Recipe);
8531 
8532     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8533     assert((Legal->isReductionVariable(Phi) ||
8534             Legal->isFirstOrderRecurrence(Phi)) &&
8535            "can only widen reductions and first-order recurrences here");
8536     VPValue *StartV = Operands[0];
8537     if (Legal->isReductionVariable(Phi)) {
8538       const RecurrenceDescriptor &RdxDesc =
8539           Legal->getReductionVars().find(Phi)->second;
8540       assert(RdxDesc.getRecurrenceStartValue() ==
8541              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8542       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8543                                            CM.isInLoopReduction(Phi),
8544                                            CM.useOrderedReductions(RdxDesc));
8545     } else {
8546       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8547     }
8548 
    // Record the incoming value from the backedge, so it can be added as an
    // operand to the phi recipe after all recipes have been created.
8551     recordRecipeOf(cast<Instruction>(
8552         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8553     PhisToFix.push_back(PhiRecipe);
8554     return toVPRecipeResult(PhiRecipe);
8555   }
8556 
8557   if (isa<TruncInst>(Instr) &&
8558       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8559                                                Range, *Plan)))
8560     return toVPRecipeResult(Recipe);
8561 
8562   // All widen recipes below deal only with VF > 1.
8563   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8564           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8565     return nullptr;
8566 
8567   if (auto *CI = dyn_cast<CallInst>(Instr))
8568     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8569 
8570   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8571     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8572 
8573   if (!shouldWiden(Instr, Range))
8574     return nullptr;
8575 
8576   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8577     return toVPRecipeResult(new VPWidenGEPRecipe(
8578         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8579 
8580   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8581     bool InvariantCond =
8582         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8583     return toVPRecipeResult(new VPWidenSelectRecipe(
8584         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8585   }
8586 
8587   return toVPRecipeResult(tryToWiden(Instr, Operands));
8588 }
8589 
8590 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8591                                                         ElementCount MaxVF) {
8592   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8593 
8594   // Collect instructions from the original loop that will become trivially dead
8595   // in the vectorized loop. We don't need to vectorize these instructions. For
8596   // example, original induction update instructions can become dead because we
8597   // separately emit induction "steps" when generating code for the new loop.
8598   // Similarly, we create a new latch condition when setting up the structure
8599   // of the new loop, so the old one can become dead.
8600   SmallPtrSet<Instruction *, 4> DeadInstructions;
8601   collectTriviallyDeadInstructions(DeadInstructions);
8602 
8603   // Add assume instructions we need to drop to DeadInstructions, to prevent
8604   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8606   // control flow is preserved, we should keep them.
8607   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8608   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8609 
8610   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8611   // Dead instructions do not need sinking. Remove them from SinkAfter.
8612   for (Instruction *I : DeadInstructions)
8613     SinkAfter.erase(I);
8614 
8615   // Cannot sink instructions after dead instructions (there won't be any
8616   // recipes for them). Instead, find the first non-dead previous instruction.
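  // For example, if the recorded target is itself a dead induction update,
  // walk backwards within its block until a live instruction is found and
  // sink after that one instead.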
8617   for (auto &P : Legal->getSinkAfter()) {
8618     Instruction *SinkTarget = P.second;
8619     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8620     (void)FirstInst;
8621     while (DeadInstructions.contains(SinkTarget)) {
8622       assert(
8623           SinkTarget != FirstInst &&
8624           "Must find a live instruction (at least the one feeding the "
8625           "first-order recurrence PHI) before reaching beginning of the block");
8626       SinkTarget = SinkTarget->getPrevNode();
8627       assert(SinkTarget != P.first &&
8628              "sink source equals target, no sinking required");
8629     }
8630     P.second = SinkTarget;
8631   }
8632 
8633   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8634   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8635     VFRange SubRange = {VF, MaxVFPlusOne};
8636     VPlans.push_back(
8637         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8638     VF = SubRange.End;
8639   }
8640 }
8641 
8642 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8643 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8644 // BranchOnCount VPInstruction to the latch.
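// Schematically (names are illustrative), the header then starts with
//   %cv      = CANONICAL-INDUCTION [ 0, vector.ph ], [ %cv.next, latch ]
// and the exiting (latch) block ends with
//   %cv.next = %cv + VF * UF
//   branch-on-count %cv.next, vector-trip-count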
8645 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8646                                   bool HasNUW) {
8647   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8648   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8649 
8650   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8651   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8652   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8653   Header->insert(CanonicalIVPHI, Header->begin());
8654 
8655   auto *CanonicalIVIncrement =
8656       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8657                                : VPInstruction::CanonicalIVIncrement,
8658                         {CanonicalIVPHI}, DL);
8659   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8660 
8661   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8662   EB->appendRecipe(CanonicalIVIncrement);
8663 
8664   auto *BranchOnCount =
8665       new VPInstruction(VPInstruction::BranchOnCount,
8666                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8667   EB->appendRecipe(BranchOnCount);
8668 }
8669 
8670 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8671 // original exit block.
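// For example, for an LCSSA phi in the exit block such as
//   %res.lcssa = phi i32 [ %res, %loop ]
// a live-out is added for %res.lcssa based on the VPValue modeling %res.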
8672 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8673                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8674                                 VPlan &Plan) {
8675   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8676   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8677   // Only handle single-exit loops with unique exit blocks for now.
8678   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8679     return;
8680 
8681   // Introduce VPUsers modeling the exit values.
8682   for (PHINode &ExitPhi : ExitBB->phis()) {
8683     Value *IncomingValue =
8684         ExitPhi.getIncomingValueForBlock(ExitingBB);
8685     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8686     Plan.addLiveOut(&ExitPhi, V);
8687   }
8688 }
8689 
8690 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8691     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8692     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8693 
8694   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8695 
8696   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8697 
8698   // ---------------------------------------------------------------------------
8699   // Pre-construction: record ingredients whose recipes we'll need to further
8700   // process after constructing the initial VPlan.
8701   // ---------------------------------------------------------------------------
8702 
8703   // Mark instructions we'll need to sink later and their targets as
8704   // ingredients whose recipe we'll need to record.
8705   for (auto &Entry : SinkAfter) {
8706     RecipeBuilder.recordRecipeOf(Entry.first);
8707     RecipeBuilder.recordRecipeOf(Entry.second);
8708   }
8709   for (auto &Reduction : CM.getInLoopReductionChains()) {
8710     PHINode *Phi = Reduction.first;
8711     RecurKind Kind =
8712         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8713     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8714 
8715     RecipeBuilder.recordRecipeOf(Phi);
8716     for (auto &R : ReductionOperations) {
8717       RecipeBuilder.recordRecipeOf(R);
8718       // For min/max reductions, where we have a pair of icmp/select, we also
8719       // need to record the ICmp recipe, so it can be removed later.
8720       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8721              "Only min/max recurrences allowed for inloop reductions");
8722       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8723         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8724     }
8725   }
8726 
8727   // For each interleave group which is relevant for this (possibly trimmed)
8728   // Range, add it to the set of groups to be later applied to the VPlan and add
8729   // placeholders for its members' Recipes which we'll be replacing with a
8730   // single VPInterleaveRecipe.
8731   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8732     auto applyIG = [IG, this](ElementCount VF) -> bool {
8733       return (VF.isVector() && // Query is illegal for VF == 1
8734               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8735                   LoopVectorizationCostModel::CM_Interleave);
8736     };
8737     if (!getDecisionAndClampRange(applyIG, Range))
8738       continue;
8739     InterleaveGroups.insert(IG);
8740     for (unsigned i = 0; i < IG->getFactor(); i++)
8741       if (Instruction *Member = IG->getMember(i))
8742         RecipeBuilder.recordRecipeOf(Member);
  }
8744 
8745   // ---------------------------------------------------------------------------
8746   // Build initial VPlan: Scan the body of the loop in a topological order to
8747   // visit each basic block after having visited its predecessor basic blocks.
8748   // ---------------------------------------------------------------------------
8749 
8750   // Create initial VPlan skeleton, starting with a block for the pre-header,
8751   // followed by a region for the vector loop, followed by the middle block. The
8752   // skeleton vector loop region contains a header and latch block.
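  // That is:
  //   vector.ph -> [ vector loop: vector.body -> vector.latch ] -> middle.block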
8753   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8754   auto Plan = std::make_unique<VPlan>(Preheader);
8755 
8756   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8757   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8758   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8759   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8760   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8761   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8762   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8763 
8764   Instruction *DLInst =
8765       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8766   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8767                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8768                         !CM.foldTailByMasking());
8769 
8770   // Scan the body of the loop in a topological order to visit each basic block
8771   // after having visited its predecessor basic blocks.
8772   LoopBlocksDFS DFS(OrigLoop);
8773   DFS.perform(LI);
8774 
8775   VPBasicBlock *VPBB = HeaderVPBB;
8776   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8777   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8778     // Relevant instructions from basic block BB will be grouped into VPRecipe
8779     // ingredients and fill a new VPBasicBlock.
8780     unsigned VPBBsForBB = 0;
8781     if (VPBB != HeaderVPBB)
8782       VPBB->setName(BB->getName());
8783     Builder.setInsertPoint(VPBB);
8784 
8785     // Introduce each ingredient into VPlan.
8786     // TODO: Model and preserve debug intrinsics in VPlan.
8787     for (Instruction &I : BB->instructionsWithoutDebug()) {
8788       Instruction *Instr = &I;
8789 
8790       // First filter out irrelevant instructions, to ensure no recipes are
8791       // built for them.
8792       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8793         continue;
8794 
8795       SmallVector<VPValue *, 4> Operands;
8796       auto *Phi = dyn_cast<PHINode>(Instr);
8797       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8798         Operands.push_back(Plan->getOrAddVPValue(
8799             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8800       } else {
8801         auto OpRange = Plan->mapToVPValues(Instr->operands());
8802         Operands = {OpRange.begin(), OpRange.end()};
8803       }
8804 
      // Invariant stores inside the loop will be deleted and a single store
      // with the final reduction value will be added to the exit block.
8807       StoreInst *SI;
8808       if ((SI = dyn_cast<StoreInst>(&I)) &&
8809           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8810         continue;
8811 
8812       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8813               Instr, Operands, Range, Plan)) {
8814         // If Instr can be simplified to an existing VPValue, use it.
8815         if (RecipeOrValue.is<VPValue *>()) {
8816           auto *VPV = RecipeOrValue.get<VPValue *>();
8817           Plan->addVPValue(Instr, VPV);
8818           // If the re-used value is a recipe, register the recipe for the
8819           // instruction, in case the recipe for Instr needs to be recorded.
8820           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8821             RecipeBuilder.setRecipe(Instr, R);
8822           continue;
8823         }
8824         // Otherwise, add the new recipe.
8825         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8826         for (auto *Def : Recipe->definedValues()) {
8827           auto *UV = Def->getUnderlyingValue();
8828           Plan->addVPValue(UV, Def);
8829         }
8830 
8831         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8832             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8833           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8834           // of the header block. That can happen for truncates of induction
8835           // variables. Those recipes are moved to the phi section of the header
8836           // block after applying SinkAfter, which relies on the original
8837           // position of the trunc.
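          // For example, for
          //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
          //   %t  = trunc i64 %iv to i32
          // the widened induction recipe created for %t stays at %t's original
          // position until sink-after has been applied.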
8838           assert(isa<TruncInst>(Instr));
8839           InductionsToMove.push_back(
8840               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8841         }
8842         RecipeBuilder.setRecipe(Instr, Recipe);
8843         VPBB->appendRecipe(Recipe);
8844         continue;
8845       }
8846 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8849       VPBasicBlock *NextVPBB =
8850           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8851       if (NextVPBB != VPBB) {
8852         VPBB = NextVPBB;
8853         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8854                                     : "");
8855       }
8856     }
8857 
8858     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8859     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8860   }
8861 
8862   HeaderVPBB->setName("vector.body");
8863 
8864   // Fold the last, empty block into its predecessor.
8865   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8866   assert(VPBB && "expected to fold last (empty) block");
8867   // After here, VPBB should not be used.
8868   VPBB = nullptr;
8869 
8870   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8871 
8872   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8873          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8874          "entry block must be set to a VPRegionBlock having a non-empty entry "
8875          "VPBasicBlock");
8876   RecipeBuilder.fixHeaderPhis();
8877 
8878   // ---------------------------------------------------------------------------
8879   // Transform initial VPlan: Apply previously taken decisions, in order, to
8880   // bring the VPlan to its final state.
8881   // ---------------------------------------------------------------------------
8882 
8883   // Apply Sink-After legal constraints.
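  // Each (Sink, Target) entry requires the recipe created for Sink to be
  // placed right after the recipe created for Target, e.g. to sink users of a
  // first-order recurrence phi below the instruction producing the
  // recurrence's next value.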
8884   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8885     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8886     if (Region && Region->isReplicator()) {
8887       assert(Region->getNumSuccessors() == 1 &&
8888              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8889       assert(R->getParent()->size() == 1 &&
8890              "A recipe in an original replicator region must be the only "
8891              "recipe in its block");
8892       return Region;
8893     }
8894     return nullptr;
8895   };
8896   for (auto &Entry : SinkAfter) {
8897     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8898     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8899 
8900     auto *TargetRegion = GetReplicateRegion(Target);
8901     auto *SinkRegion = GetReplicateRegion(Sink);
8902     if (!SinkRegion) {
8903       // If the sink source is not a replicate region, sink the recipe directly.
8904       if (TargetRegion) {
8905         // The target is in a replication region, make sure to move Sink to
8906         // the block after it, not into the replication region itself.
8907         VPBasicBlock *NextBlock =
8908             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8909         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8910       } else
8911         Sink->moveAfter(Target);
8912       continue;
8913     }
8914 
8915     // The sink source is in a replicate region. Unhook the region from the CFG.
8916     auto *SinkPred = SinkRegion->getSinglePredecessor();
8917     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8918     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8919     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8920     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8921 
8922     if (TargetRegion) {
8923       // The target recipe is also in a replicate region, move the sink region
8924       // after the target region.
8925       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8926       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8927       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8928       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8929     } else {
8930       // The sink source is in a replicate region, we need to move the whole
8931       // replicate region, which should only contain a single recipe in the
8932       // main block.
8933       auto *SplitBlock =
8934           Target->getParent()->splitAt(std::next(Target->getIterator()));
8935 
8936       auto *SplitPred = SplitBlock->getSinglePredecessor();
8937 
8938       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8939       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8940       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8941     }
8942   }
8943 
8944   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8945   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8946 
8947   // Now that sink-after is done, move induction recipes for optimized truncates
8948   // to the phi section of the header block.
8949   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8950     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8951 
8952   // Adjust the recipes for any inloop reductions.
8953   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
8954                              RecipeBuilder, Range.Start);
8955 
8956   // Introduce a recipe to combine the incoming and previous values of a
8957   // first-order recurrence.
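  // For example, for the recurrence
  //   %rec = phi i32 [ %init, %ph ], [ %next, %loop ]
  // a FirstOrderRecurrenceSplice of the phi and %next is created after the
  // recipe defining %next, and all users of the phi recipe are rewired to it.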
8958   for (VPRecipeBase &R :
8959        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8960     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
8961     if (!RecurPhi)
8962       continue;
8963 
8964     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
8965     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
8966     auto *Region = GetReplicateRegion(PrevRecipe);
8967     if (Region)
8968       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
8969     if (Region || PrevRecipe->isPhi())
8970       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
8971     else
8972       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
8973 
8974     auto *RecurSplice = cast<VPInstruction>(
8975         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
8976                              {RecurPhi, RecurPhi->getBackedgeValue()}));
8977 
8978     RecurPhi->replaceAllUsesWith(RecurSplice);
8979     // Set the first operand of RecurSplice to RecurPhi again, after replacing
8980     // all users.
8981     RecurSplice->setOperand(0, RecurPhi);
8982   }
8983 
8984   // Interleave memory: for each Interleave Group we marked earlier as relevant
8985   // for this VPlan, replace the Recipes widening its memory instructions with a
8986   // single VPInterleaveRecipe at its insertion point.
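  // For example, a group of factor 2 covering loads of A[2*i] and A[2*i+1] is
  // modeled by one VPInterleaveRecipe at the group's insert position; the
  // members' original memory recipes are erased below.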
8987   for (auto IG : InterleaveGroups) {
8988     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8989         RecipeBuilder.getRecipe(IG->getInsertPos()));
8990     SmallVector<VPValue *, 4> StoredValues;
8991     for (unsigned i = 0; i < IG->getFactor(); ++i)
8992       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8993         auto *StoreR =
8994             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8995         StoredValues.push_back(StoreR->getStoredValue());
8996       }
8997 
8998     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8999                                         Recipe->getMask());
9000     VPIG->insertBefore(Recipe);
9001     unsigned J = 0;
9002     for (unsigned i = 0; i < IG->getFactor(); ++i)
9003       if (Instruction *Member = IG->getMember(i)) {
9004         if (!Member->getType()->isVoidTy()) {
9005           VPValue *OriginalV = Plan->getVPValue(Member);
9006           Plan->removeVPValueFor(Member);
9007           Plan->addVPValue(Member, VPIG->getVPValue(J));
9008           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9009           J++;
9010         }
9011         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9012       }
9013   }
9014 
  // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that make accessing values via the original IR values incorrect.
9017   Plan->disableValue2VPValue();
9018 
9019   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9020   VPlanTransforms::sinkScalarOperands(*Plan);
9021   VPlanTransforms::mergeReplicateRegions(*Plan);
9022   VPlanTransforms::removeDeadRecipes(*Plan);
9023   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9024 
9025   std::string PlanName;
9026   raw_string_ostream RSO(PlanName);
9027   ElementCount VF = Range.Start;
9028   Plan->addVF(VF);
9029   RSO << "Initial VPlan for VF={" << VF;
9030   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9031     Plan->addVF(VF);
9032     RSO << "," << VF;
9033   }
9034   RSO << "},UF>=1";
9035   RSO.flush();
9036   Plan->setName(PlanName);
9037 
9038   // Fold Exit block into its predecessor if possible.
9039   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9040   // VPBasicBlock as exit.
9041   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
9042 
9043   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9044   return Plan;
9045 }
9046 
9047 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is profitable.
9050   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9051   // the vectorization pipeline.
9052   assert(!OrigLoop->isInnermost());
9053   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9054 
9055   // Create new empty VPlan
9056   auto Plan = std::make_unique<VPlan>();
9057 
9058   // Build hierarchical CFG
9059   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9060   HCFGBuilder.buildHierarchicalCFG();
9061 
9062   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9063        VF *= 2)
9064     Plan->addVF(VF);
9065 
9066   SmallPtrSet<Instruction *, 1> DeadInstructions;
9067   VPlanTransforms::VPInstructionsToVPRecipes(
9068       OrigLoop, Plan,
9069       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9070       DeadInstructions, *PSE.getSE());
9071 
9072   // Remove the existing terminator of the exiting block of the top-most region.
9073   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9074   auto *Term =
9075       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9076   Term->eraseFromParent();
9077 
9078   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9079                         true);
9080   return Plan;
9081 }
9082 
9083 // Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be converted
9085 // to reductions, with one operand being vector and the other being the scalar
9086 // reduction chain. For other reductions, a select is introduced between the phi
9087 // and live-out recipes when folding the tail.
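// For example, for an in-loop integer add reduction
//   %red      = phi i32 [ 0, %ph ], [ %red.next, %loop ]
//   %red.next = add i32 %val, %red
// the recipe widening %red.next is replaced by a VPReductionRecipe whose chain
// operand models %red and whose vector operand models %val.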
9088 void LoopVectorizationPlanner::adjustRecipesForReductions(
9089     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9090     ElementCount MinVF) {
9091   for (auto &Reduction : CM.getInLoopReductionChains()) {
9092     PHINode *Phi = Reduction.first;
9093     const RecurrenceDescriptor &RdxDesc =
9094         Legal->getReductionVars().find(Phi)->second;
9095     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9096 
9097     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9098       continue;
9099 
    // ReductionOperations are ordered top-down, from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
9102     // which of the two operands will remain scalar and which will be reduced.
9103     // For minmax the chain will be the select instructions.
9104     Instruction *Chain = Phi;
9105     for (Instruction *R : ReductionOperations) {
9106       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9107       RecurKind Kind = RdxDesc.getRecurrenceKind();
9108 
9109       VPValue *ChainOp = Plan->getVPValue(Chain);
9110       unsigned FirstOpId;
9111       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9112              "Only min/max recurrences allowed for inloop reductions");
9113       // Recognize a call to the llvm.fmuladd intrinsic.
9114       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9115       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9116              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9117       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9118         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9119                "Expected to replace a VPWidenSelectSC");
9120         FirstOpId = 1;
9121       } else {
9122         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9123                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9124                "Expected to replace a VPWidenSC");
9125         FirstOpId = 0;
9126       }
9127       unsigned VecOpId =
9128           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9129       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9130 
9131       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9132                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9133                          : nullptr;
9134 
9135       if (IsFMulAdd) {
9136         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9137         // need to create an fmul recipe to use as the vector operand for the
9138         // fadd reduction.
9139         VPInstruction *FMulRecipe = new VPInstruction(
9140             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9141         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9142         WidenRecipe->getParent()->insert(FMulRecipe,
9143                                          WidenRecipe->getIterator());
9144         VecOp = FMulRecipe;
9145       }
9146       VPReductionRecipe *RedRecipe =
9147           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9148       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9149       Plan->removeVPValueFor(R);
9150       Plan->addVPValue(R, RedRecipe);
9151       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9153       WidenRecipe->eraseFromParent();
9154 
9155       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9156         VPRecipeBase *CompareRecipe =
9157             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9158         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9159                "Expected to replace a VPWidenSC");
9160         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9161                "Expected no remaining users");
9162         CompareRecipe->eraseFromParent();
9163       }
9164       Chain = R;
9165     }
9166   }
9167 
9168   // If tail is folded by masking, introduce selects between the phi
9169   // and the live-out instruction of each reduction, at the beginning of the
9170   // dedicated latch block.
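  // For example, a reduction phi %red.phi with backedge value %red.next gets
  //   select(header block-in mask, %red.next, %red.phi)
  // at the start of the latch, so masked-off lanes keep the previous value.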
9171   if (CM.foldTailByMasking()) {
9172     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9173     for (VPRecipeBase &R :
9174          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9175       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9176       if (!PhiR || PhiR->isInLoop())
9177         continue;
9178       VPValue *Cond =
9179           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9180       VPValue *Red = PhiR->getBackedgeValue();
9181       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9182              "reduction recipe must be defined before latch");
9183       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9184     }
9185   }
9186 }
9187 
9188 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9189 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9190                                VPSlotTracker &SlotTracker) const {
9191   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9192   IG->getInsertPos()->printAsOperand(O, false);
9193   O << ", ";
9194   getAddr()->printAsOperand(O, SlotTracker);
9195   VPValue *Mask = getMask();
9196   if (Mask) {
9197     O << ", ";
9198     Mask->printAsOperand(O, SlotTracker);
9199   }
9200 
9201   unsigned OpIdx = 0;
9202   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9203     if (!IG->getMember(i))
9204       continue;
9205     if (getNumStoreOperands() > 0) {
9206       O << "\n" << Indent << "  store ";
9207       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9208       O << " to index " << i;
9209     } else {
9210       O << "\n" << Indent << "  ";
9211       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9212       O << " = load from index " << i;
9213     }
9214     ++OpIdx;
9215   }
9216 }
9217 #endif
9218 
9219 void VPWidenCallRecipe::execute(VPTransformState &State) {
9220   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9221                                   *this, State);
9222 }
9223 
9224 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9225   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9226   State.ILV->setDebugLocFromInst(&I);
9227 
  // The condition can be loop invariant but still defined inside the
9229   // loop. This means that we can't just use the original 'cond' value.
9230   // We have to take the 'vectorized' value and pick the first lane.
9231   // Instcombine will make this a no-op.
9232   auto *InvarCond =
9233       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9234 
9235   for (unsigned Part = 0; Part < State.UF; ++Part) {
9236     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9237     Value *Op0 = State.get(getOperand(1), Part);
9238     Value *Op1 = State.get(getOperand(2), Part);
9239     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9240     State.set(this, Sel, Part);
9241     State.ILV->addMetadata(Sel, &I);
9242   }
9243 }
9244 
9245 void VPWidenRecipe::execute(VPTransformState &State) {
9246   auto &I = *cast<Instruction>(getUnderlyingValue());
9247   auto &Builder = State.Builder;
9248   switch (I.getOpcode()) {
9249   case Instruction::Call:
9250   case Instruction::Br:
9251   case Instruction::PHI:
9252   case Instruction::GetElementPtr:
9253   case Instruction::Select:
9254     llvm_unreachable("This instruction is handled by a different recipe.");
9255   case Instruction::UDiv:
9256   case Instruction::SDiv:
9257   case Instruction::SRem:
9258   case Instruction::URem:
9259   case Instruction::Add:
9260   case Instruction::FAdd:
9261   case Instruction::Sub:
9262   case Instruction::FSub:
9263   case Instruction::FNeg:
9264   case Instruction::Mul:
9265   case Instruction::FMul:
9266   case Instruction::FDiv:
9267   case Instruction::FRem:
9268   case Instruction::Shl:
9269   case Instruction::LShr:
9270   case Instruction::AShr:
9271   case Instruction::And:
9272   case Instruction::Or:
9273   case Instruction::Xor: {
9274     // Just widen unops and binops.
9275     State.ILV->setDebugLocFromInst(&I);
9276 
9277     for (unsigned Part = 0; Part < State.UF; ++Part) {
9278       SmallVector<Value *, 2> Ops;
9279       for (VPValue *VPOp : operands())
9280         Ops.push_back(State.get(VPOp, Part));
9281 
9282       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9283 
9284       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9285         VecOp->copyIRFlags(&I);
9286 
9287         // If the instruction is vectorized and was in a basic block that needed
9288         // predication, we can't propagate poison-generating flags (nuw/nsw,
9289         // exact, etc.). The control flow has been linearized and the
9290         // instruction is no longer guarded by the predicate, which could make
9291         // the flag properties to no longer hold.
9292         if (State.MayGeneratePoisonRecipes.contains(this))
9293           VecOp->dropPoisonGeneratingFlags();
9294       }
9295 
9296       // Use this vector value for all users of the original instruction.
9297       State.set(this, V, Part);
9298       State.ILV->addMetadata(V, &I);
9299     }
9300 
9301     break;
9302   }
9303   case Instruction::Freeze: {
9304     State.ILV->setDebugLocFromInst(&I);
9305 
9306     for (unsigned Part = 0; Part < State.UF; ++Part) {
9307       Value *Op = State.get(getOperand(0), Part);
9308 
9309       Value *Freeze = Builder.CreateFreeze(Op);
9310       State.set(this, Freeze, Part);
9311     }
9312     break;
9313   }
9314   case Instruction::ICmp:
9315   case Instruction::FCmp: {
    // Widen compares by generating vector compares.
9317     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9318     auto *Cmp = cast<CmpInst>(&I);
9319     State.ILV->setDebugLocFromInst(Cmp);
9320     for (unsigned Part = 0; Part < State.UF; ++Part) {
9321       Value *A = State.get(getOperand(0), Part);
9322       Value *B = State.get(getOperand(1), Part);
9323       Value *C = nullptr;
9324       if (FCmp) {
9325         // Propagate fast math flags.
9326         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9327         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9328         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9329       } else {
9330         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9331       }
9332       State.set(this, C, Part);
9333       State.ILV->addMetadata(C, &I);
9334     }
9335 
9336     break;
9337   }
9338 
9339   case Instruction::ZExt:
9340   case Instruction::SExt:
9341   case Instruction::FPToUI:
9342   case Instruction::FPToSI:
9343   case Instruction::FPExt:
9344   case Instruction::PtrToInt:
9345   case Instruction::IntToPtr:
9346   case Instruction::SIToFP:
9347   case Instruction::UIToFP:
9348   case Instruction::Trunc:
9349   case Instruction::FPTrunc:
9350   case Instruction::BitCast: {
9351     auto *CI = cast<CastInst>(&I);
9352     State.ILV->setDebugLocFromInst(CI);
9353 
    // Vectorize casts.
9355     Type *DestTy = (State.VF.isScalar())
9356                        ? CI->getType()
9357                        : VectorType::get(CI->getType(), State.VF);
9358 
9359     for (unsigned Part = 0; Part < State.UF; ++Part) {
9360       Value *A = State.get(getOperand(0), Part);
9361       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9362       State.set(this, Cast, Part);
9363       State.ILV->addMetadata(Cast, &I);
9364     }
9365     break;
9366   }
9367   default:
9368     // This instruction is not vectorized by simple widening.
9369     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9370     llvm_unreachable("Unhandled instruction!");
9371   } // end of switch.
9372 }
9373 
9374 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9375   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9376   // Construct a vector GEP by widening the operands of the scalar GEP as
9377   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9378   // results in a vector of pointers when at least one operand of the GEP
9379   // is vector-typed. Thus, to keep the representation compact, we only use
9380   // vector-typed operands for loop-varying values.
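  // For example, widening
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // with a loop-invariant %base keeps %base scalar and uses the widened %iv,
  // yielding one vector of pointers per unroll part.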
9381 
9382   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9383     // If we are vectorizing, but the GEP has only loop-invariant operands,
9384     // the GEP we build (by only using vector-typed operands for
9385     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9386     // produce a vector of pointers, we need to either arbitrarily pick an
9387     // operand to broadcast, or broadcast a clone of the original GEP.
9388     // Here, we broadcast a clone of the original.
9389     //
9390     // TODO: If at some point we decide to scalarize instructions having
9391     //       loop-invariant operands, this special case will no longer be
9392     //       required. We would add the scalarization decision to
9393     //       collectLoopScalars() and teach getVectorValue() to broadcast
9394     //       the lane-zero scalar value.
9395     auto *Clone = State.Builder.Insert(GEP->clone());
9396     for (unsigned Part = 0; Part < State.UF; ++Part) {
9397       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9398       State.set(this, EntryPart, Part);
9399       State.ILV->addMetadata(EntryPart, GEP);
9400     }
9401   } else {
9402     // If the GEP has at least one loop-varying operand, we are sure to
9403     // produce a vector of pointers. But if we are only unrolling, we want
9404     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9405     // produce with the code below will be scalar (if VF == 1) or vector
9406     // (otherwise). Note that for the unroll-only case, we still maintain
9407     // values in the vector mapping with initVector, as we do for other
9408     // instructions.
9409     for (unsigned Part = 0; Part < State.UF; ++Part) {
9410       // The pointer operand of the new GEP. If it's loop-invariant, we
9411       // won't broadcast it.
9412       auto *Ptr = IsPtrLoopInvariant
9413                       ? State.get(getOperand(0), VPIteration(0, 0))
9414                       : State.get(getOperand(0), Part);
9415 
9416       // Collect all the indices for the new GEP. If any index is
9417       // loop-invariant, we won't broadcast it.
9418       SmallVector<Value *, 4> Indices;
9419       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9420         VPValue *Operand = getOperand(I);
9421         if (IsIndexLoopInvariant[I - 1])
9422           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9423         else
9424           Indices.push_back(State.get(Operand, Part));
9425       }
9426 
9427       // If the GEP instruction is vectorized and was in a basic block that
9428       // needed predication, we can't propagate the poison-generating 'inbounds'
9429       // flag. The control flow has been linearized and the GEP is no longer
9430       // guarded by the predicate, which could make the 'inbounds' properties to
9431       // no longer hold.
9432       bool IsInBounds =
9433           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9434 
9435       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9436       // but it should be a vector, otherwise.
9437       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
9438                                              Indices, "", IsInBounds);
9439       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9440              "NewGEP is not a pointer vector");
9441       State.set(this, NewGEP, Part);
9442       State.ILV->addMetadata(NewGEP, GEP);
9443     }
9444   }
9445 }
9446 
9447 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9448   assert(!State.Instance && "Int or FP induction being replicated.");
9449 
9450   Value *Start = getStartValue()->getLiveInIRValue();
9451   const InductionDescriptor &ID = getInductionDescriptor();
9452   TruncInst *Trunc = getTruncInst();
9453   IRBuilderBase &Builder = State.Builder;
9454   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9455   assert(State.VF.isVector() && "must have vector VF");
9456 
9457   // The value from the original loop to which we are mapping the new induction
9458   // variable.
9459   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9460 
9461   // Fast-math-flags propagate from the original induction instruction.
9462   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9463   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9464     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9465 
9466   // Now do the actual transformations, and start with fetching the step value.
9467   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9468 
9469   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9470          "Expected either an induction phi-node or a truncate of it!");
9471 
9472   // Construct the initial value of the vector IV in the vector loop preheader
9473   auto CurrIP = Builder.saveIP();
9474   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9475   Builder.SetInsertPoint(VectorPH->getTerminator());
9476   if (isa<TruncInst>(EntryVal)) {
9477     assert(Start->getType()->isIntegerTy() &&
9478            "Truncation requires an integer type");
9479     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9480     Step = Builder.CreateTrunc(Step, TruncType);
9481     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9482   }
9483 
9484   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9485   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9486   Value *SteppedStart = getStepVector(
9487       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9488 
9489   // We create vector phi nodes for both integer and floating-point induction
9490   // variables. Here, we determine the kind of arithmetic we will perform.
9491   Instruction::BinaryOps AddOp;
9492   Instruction::BinaryOps MulOp;
9493   if (Step->getType()->isIntegerTy()) {
9494     AddOp = Instruction::Add;
9495     MulOp = Instruction::Mul;
9496   } else {
9497     AddOp = ID.getInductionOpcode();
9498     MulOp = Instruction::FMul;
9499   }
9500 
9501   // Multiply the vectorization factor by the step using integer or
9502   // floating-point arithmetic as appropriate.
9503   Type *StepType = Step->getType();
9504   Value *RuntimeVF;
9505   if (Step->getType()->isFloatingPointTy())
9506     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9507   else
9508     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9509   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9510 
9511   // Create a vector splat to use in the induction update.
9512   //
9513   // FIXME: If the step is non-constant, we create the vector splat with
9514   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9515   //        handle a constant vector splat.
9516   Value *SplatVF = isa<Constant>(Mul)
9517                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9518                        : Builder.CreateVectorSplat(State.VF, Mul);
9519   Builder.restoreIP(CurrIP);
9520 
9521   // We may need to add the step a number of times, depending on the unroll
9522   // factor. The last of those goes into the PHI.
9523   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9524                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9525   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9526   Instruction *LastInduction = VecInd;
9527   for (unsigned Part = 0; Part < State.UF; ++Part) {
9528     State.set(this, LastInduction, Part);
9529 
9530     if (isa<TruncInst>(EntryVal))
9531       State.ILV->addMetadata(LastInduction, EntryVal);
9532 
9533     LastInduction = cast<Instruction>(
9534         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9535     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9536   }
9537 
9538   LastInduction->setName("vec.ind.next");
9539   VecInd->addIncoming(SteppedStart, VectorPH);
9540   // Add induction update using an incorrect block temporarily. The phi node
9541   // will be fixed after VPlan execution. Note that at this point the latch
9542   // block cannot be used, as it does not exist yet.
9543   // TODO: Model increment value in VPlan, by turning the recipe into a
9544   // multi-def and a subclass of VPHeaderPHIRecipe.
9545   VecInd->addIncoming(LastInduction, VectorPH);
9546 }
9547 
9548 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9549   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9550          "Not a pointer induction according to InductionDescriptor!");
9551   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9552          "Unexpected type.");
9553 
9554   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9555   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9556 
9557   if (onlyScalarsGenerated(State.VF)) {
9558     // This is the normalized GEP that starts counting at zero.
9559     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9560         CanonicalIV, IndDesc.getStep()->getType());
9561     // Determine the number of scalars we need to generate for each unroll
9562     // iteration. If the instruction is uniform, we only need to generate the
9563     // first lane. Otherwise, we generate all VF values.
9564     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9565     assert((IsUniform || !State.VF.isScalable()) &&
9566            "Cannot scalarize a scalable VF");
9567     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9568 
9569     for (unsigned Part = 0; Part < State.UF; ++Part) {
9570       Value *PartStart =
9571           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9572 
9573       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9574         Value *Idx = State.Builder.CreateAdd(
9575             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9576         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9577 
9578         Value *Step = CreateStepValue(IndDesc.getStep(), SE,
9579                                       State.CFG.PrevBB->getTerminator());
9580         Value *SclrGep = emitTransformedIndex(
9581             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9582         SclrGep->setName("next.gep");
9583         State.set(this, SclrGep, VPIteration(Part, Lane));
9584       }
9585     }
9586     return;
9587   }
9588 
9589   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9590          "Induction step not a SCEV constant!");
9591   Type *PhiType = IndDesc.getStep()->getType();
9592 
9593   // Build a pointer phi
9594   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9595   Type *ScStValueType = ScalarStartValue->getType();
9596   PHINode *NewPointerPhi =
9597       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9598 
9599   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9600   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9601 
9602   // A pointer induction, performed by using a gep
9603   const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
9604   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9605 
9606   const SCEV *ScalarStep = IndDesc.getStep();
9607   SCEVExpander Exp(SE, DL, "induction");
9608   Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
9609   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9610   Value *NumUnrolledElems =
9611       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9612   Value *InductionGEP = GetElementPtrInst::Create(
9613       IndDesc.getElementType(), NewPointerPhi,
9614       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9615       InductionLoc);
9616   // Add induction update using an incorrect block temporarily. The phi node
9617   // will be fixed after VPlan execution. Note that at this point the latch
9618   // block cannot be used, as it does not exist yet.
9619   // TODO: Model increment value in VPlan, by turning the recipe into a
9620   // multi-def and a subclass of VPHeaderPHIRecipe.
9621   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9622 
9623   // Create UF many actual address geps that use the pointer
9624   // phi as base and a vectorized version of the step value
9625   // (<step*0, ..., step*N>) as offset.
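  // For example (illustrative): with VF=4, UF=2 and an element step S, part 0
  // indexes the pointer phi with offsets <0,1,2,3> * S and part 1 with
  // <4,5,6,7> * S.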
9626   for (unsigned Part = 0; Part < State.UF; ++Part) {
9627     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9628     Value *StartOffsetScalar =
9629         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9630     Value *StartOffset =
9631         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9632     // Create a vector of consecutive numbers from zero to VF.
9633     StartOffset = State.Builder.CreateAdd(
9634         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9635 
9636     Value *GEP = State.Builder.CreateGEP(
9637         IndDesc.getElementType(), NewPointerPhi,
9638         State.Builder.CreateMul(
9639             StartOffset,
9640             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9641             "vector.gep"));
9642     State.set(this, GEP, Part);
9643   }
9644 }
9645 
9646 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9647   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9648 
9649   // Fast-math-flags propagate from the original induction instruction.
9650   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9651   if (IndDesc.getInductionBinOp() &&
9652       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9653     State.Builder.setFastMathFlags(
9654         IndDesc.getInductionBinOp()->getFastMathFlags());
9655 
9656   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9657   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9658     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9659     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9660     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9661       ScalarIV =
9662           Ty->isIntegerTy()
9663               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9664               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9665       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9666                                       getStartValue()->getLiveInIRValue(), Step,
9667                                       IndDesc);
9668       ScalarIV->setName("offset.idx");
9669     }
9670     if (TruncToTy) {
9671       assert(Step->getType()->isIntegerTy() &&
9672              "Truncation requires an integer step");
9673       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9674       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9675     }
9676     return ScalarIV;
9677   };
9678 
9679   Value *ScalarIV = CreateScalarIV(Step);
9680   if (State.VF.isVector()) {
9681     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9682     return;
9683   }
9684 
9685   for (unsigned Part = 0; Part < State.UF; ++Part) {
9686     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9687     Value *EntryPart;
9688     if (Step->getType()->isFloatingPointTy()) {
9689       Value *StartIdx =
9690           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9691       // Floating-point operations inherit FMF via the builder's flags.
9692       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9693       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9694                                             ScalarIV, MulOp);
9695     } else {
9696       Value *StartIdx =
9697           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9698       EntryPart = State.Builder.CreateAdd(
9699           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9700     }
9701     State.set(this, EntryPart, Part);
9702   }
9703 }
9704 
9705 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9706   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9707                                  State);
9708 }
9709 
9710 void VPBlendRecipe::execute(VPTransformState &State) {
9711   State.ILV->setDebugLocFromInst(Phi);
9712   // We know that all PHIs in non-header blocks are converted into
9713   // selects, so we don't have to worry about the insertion order and we
9714   // can just use the builder.
9715   // At this point we generate the predication tree. There may be
9716   // duplications since this is a simple recursive scan, but future
9717   // optimizations will clean it up.
9718 
9719   unsigned NumIncoming = getNumIncomingValues();
9720 
9721   // Generate a sequence of selects of the form:
9722   // SELECT(Mask3, In3,
9723   //        SELECT(Mask2, In2,
9724   //               SELECT(Mask1, In1,
9725   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // (and which are therefore essentially undef) are taken from In0.
9728   InnerLoopVectorizer::VectorParts Entry(State.UF);
9729   for (unsigned In = 0; In < NumIncoming; ++In) {
9730     for (unsigned Part = 0; Part < State.UF; ++Part) {
9731       // We might have single edge PHIs (blocks) - use an identity
9732       // 'select' for the first PHI operand.
9733       Value *In0 = State.get(getIncomingValue(In), Part);
9734       if (In == 0)
9735         Entry[Part] = In0; // Initialize with the first incoming value.
9736       else {
9737         // Select between the current value and the previous incoming edge
9738         // based on the incoming mask.
9739         Value *Cond = State.get(getMask(In), Part);
9740         Entry[Part] =
9741             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9742       }
9743     }
9744   }
9745   for (unsigned Part = 0; Part < State.UF; ++Part)
9746     State.set(this, Entry[Part], Part);
9747 }
9748 
9749 void VPInterleaveRecipe::execute(VPTransformState &State) {
9750   assert(!State.Instance && "Interleave group being replicated.");
9751   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9752                                       getStoredValues(), getMask());
9753 }
9754 
9755 void VPReductionRecipe::execute(VPTransformState &State) {
9756   assert(!State.Instance && "Reduction being replicated.");
9757   Value *PrevInChain = State.get(getChainOp(), 0);
9758   RecurKind Kind = RdxDesc->getRecurrenceKind();
9759   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9760   // Propagate the fast-math flags carried by the underlying instruction.
9761   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9762   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9763   for (unsigned Part = 0; Part < State.UF; ++Part) {
9764     Value *NewVecOp = State.get(getVecOp(), Part);
9765     if (VPValue *Cond = getCondOp()) {
9766       Value *NewCond = State.get(Cond, Part);
9767       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9768       Value *Iden = RdxDesc->getRecurrenceIdentity(
9769           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9770       Value *IdenVec =
9771           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9772       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9773       NewVecOp = Select;
9774     }
9775     Value *NewRed;
9776     Value *NextInChain;
9777     if (IsOrdered) {
9778       if (State.VF.isVector())
9779         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9780                                         PrevInChain);
9781       else
9782         NewRed = State.Builder.CreateBinOp(
9783             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9784             NewVecOp);
9785       PrevInChain = NewRed;
9786     } else {
9787       PrevInChain = State.get(getChainOp(), Part);
9788       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9789     }
9790     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9791       NextInChain =
9792           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9793                          NewRed, PrevInChain);
9794     } else if (IsOrdered)
9795       NextInChain = NewRed;
9796     else
9797       NextInChain = State.Builder.CreateBinOp(
9798           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9799           PrevInChain);
9800     State.set(this, NextInChain, Part);
9801   }
9802 }
9803 
9804 void VPReplicateRecipe::execute(VPTransformState &State) {
9805   if (State.Instance) { // Generate a single instance.
9806     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9807     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9808                                     IsPredicated, State);
9809     // Insert scalar instance packing it into a vector.
9810     if (AlsoPack && State.VF.isVector()) {
9811       // If we're constructing lane 0, initialize to start from poison.
9812       if (State.Instance->Lane.isFirstLane()) {
9813         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9814         Value *Poison = PoisonValue::get(
9815             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9816         State.set(this, Poison, State.Instance->Part);
9817       }
9818       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9819     }
9820     return;
9821   }
9822 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
9826   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9827   assert((!State.VF.isScalable() || IsUniform) &&
9828          "Can't scalarize a scalable vector");
9829   for (unsigned Part = 0; Part < State.UF; ++Part)
9830     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9831       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9832                                       VPIteration(Part, Lane), IsPredicated,
9833                                       State);
9834 }
9835 
9836 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9837   assert(State.Instance && "Branch on Mask works only on single instance.");
9838 
9839   unsigned Part = State.Instance->Part;
9840   unsigned Lane = State.Instance->Lane.getKnownLane();
9841 
9842   Value *ConditionBit = nullptr;
9843   VPValue *BlockInMask = getMask();
9844   if (BlockInMask) {
9845     ConditionBit = State.get(BlockInMask, Part);
9846     if (ConditionBit->getType()->isVectorTy())
9847       ConditionBit = State.Builder.CreateExtractElement(
9848           ConditionBit, State.Builder.getInt32(Lane));
9849   } else // Block in mask is all-one.
9850     ConditionBit = State.Builder.getTrue();
9851 
9852   // Replace the temporary unreachable terminator with a new conditional branch,
9853   // whose two destinations will be set later when they are created.
9854   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9855   assert(isa<UnreachableInst>(CurrentTerminator) &&
9856          "Expected to replace unreachable terminator with conditional branch.");
9857   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9858   CondBr->setSuccessor(0, nullptr);
9859   ReplaceInstWithInst(CurrentTerminator, CondBr);
9860 }
9861 
9862 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9863   assert(State.Instance && "Predicated instruction PHI works per instance.");
9864   Instruction *ScalarPredInst =
9865       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9866   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9867   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9868   assert(PredicatingBB && "Predicated block has no single predecessor.");
9869   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9870          "operand must be VPReplicateRecipe");
9871 
9872   // By current pack/unpack logic we need to generate only a single phi node: if
9873   // a vector value for the predicated instruction exists at this point it means
9874   // the instruction has vector users only, and a phi for the vector value is
9875   // needed. In this case the recipe of the predicated instruction is marked to
9876   // also do that packing, thereby "hoisting" the insert-element sequence.
9877   // Otherwise, a phi node for the scalar value is needed.
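  // For example (illustrative IR, made-up names): if the predicated block
  // packed the scalar as %v.new = insertelement <4 x i32> %v, i32 %s, i32 2,
  // the phi generated here is
  //   %vphi = phi <4 x i32> [ %v, %predicating ], [ %v.new, %predicated ]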
9878   unsigned Part = State.Instance->Part;
9879   if (State.hasVectorValue(getOperand(0), Part)) {
9880     Value *VectorValue = State.get(getOperand(0), Part);
9881     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9882     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9883     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9884     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9885     if (State.hasVectorValue(this, Part))
9886       State.reset(this, VPhi, Part);
9887     else
9888       State.set(this, VPhi, Part);
9889     // NOTE: Currently we need to update the value of the operand, so the next
9890     // predicated iteration inserts its generated value in the correct vector.
9891     State.reset(getOperand(0), VPhi, Part);
9892   } else {
9893     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9894     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9895     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9896                      PredicatingBB);
9897     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9898     if (State.hasScalarValue(this, *State.Instance))
9899       State.reset(this, Phi, *State.Instance);
9900     else
9901       State.set(this, Phi, *State.Instance);
9902     // NOTE: Currently we need to update the value of the operand, so the next
9903     // predicated iteration inserts its generated value in the correct vector.
9904     State.reset(getOperand(0), Phi, *State.Instance);
9905   }
9906 }
9907 
9908 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9909   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9910 
9911   // Attempt to issue a wide load.
9912   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9913   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9914 
9915   assert((LI || SI) && "Invalid Load/Store instruction");
9916   assert((!SI || StoredValue) && "No stored value provided for widened store");
9917   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9918 
9919   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9920 
9921   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9922   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9923   bool CreateGatherScatter = !Consecutive;
9924 
9925   auto &Builder = State.Builder;
9926   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9927   bool isMaskRequired = getMask();
9928   if (isMaskRequired)
9929     for (unsigned Part = 0; Part < State.UF; ++Part)
9930       BlockInMaskParts[Part] = State.get(getMask(), Part);
9931 
9932   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9933     // Calculate the pointer for the specific unroll-part.
9934     GetElementPtrInst *PartPtr = nullptr;
9935 
9936     bool InBounds = false;
9937     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9938       InBounds = gep->isInBounds();
9939     if (Reverse) {
9940       // If the address is consecutive but reversed, then the
9941       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors, VScale is 1, so RunTimeVF equals
      // VF.getKnownMinValue().
9944       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9945       // NumElt = -Part * RunTimeVF
9946       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9947       // LastLane = 1 - RunTimeVF
9948       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
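      // For example (illustrative): for fixed VF=4 and Part=1, RunTimeVF=4,
      // NumElt=-4 and LastLane=-3, so PartPtr points at Ptr-7 and the wide
      // access covers Ptr-7..Ptr-4; the later reverse shuffle restores the
      // original Ptr-4..Ptr-7 lane order.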
9949       PartPtr =
9950           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9951       PartPtr->setIsInBounds(InBounds);
9952       PartPtr = cast<GetElementPtrInst>(
9953           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9954       PartPtr->setIsInBounds(InBounds);
9955       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9956         BlockInMaskParts[Part] =
9957             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9958     } else {
9959       Value *Increment =
9960           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9961       PartPtr = cast<GetElementPtrInst>(
9962           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9963       PartPtr->setIsInBounds(InBounds);
9964     }
9965 
9966     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9967     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9968   };
9969 
9970   // Handle Stores:
9971   if (SI) {
9972     State.ILV->setDebugLocFromInst(SI);
9973 
9974     for (unsigned Part = 0; Part < State.UF; ++Part) {
9975       Instruction *NewSI = nullptr;
9976       Value *StoredVal = State.get(StoredValue, Part);
9977       if (CreateGatherScatter) {
9978         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9979         Value *VectorGep = State.get(getAddr(), Part);
9980         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9981                                             MaskPart);
9982       } else {
9983         if (Reverse) {
9984           // If we store to reverse consecutive memory locations, then we need
9985           // to reverse the order of elements in the stored value.
9986           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9987           // We don't want to update the value in the map as it might be used in
9988           // another expression. So don't call resetVectorValue(StoredVal).
9989         }
9990         auto *VecPtr =
9991             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9992         if (isMaskRequired)
9993           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9994                                             BlockInMaskParts[Part]);
9995         else
9996           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9997       }
9998       State.ILV->addMetadata(NewSI, SI);
9999     }
10000     return;
10001   }
10002 
10003   // Handle loads.
10004   assert(LI && "Must have a load instruction");
10005   State.ILV->setDebugLocFromInst(LI);
10006   for (unsigned Part = 0; Part < State.UF; ++Part) {
10007     Value *NewLI;
10008     if (CreateGatherScatter) {
10009       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10010       Value *VectorGep = State.get(getAddr(), Part);
10011       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10012                                          nullptr, "wide.masked.gather");
10013       State.ILV->addMetadata(NewLI, LI);
10014     } else {
10015       auto *VecPtr =
10016           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10017       if (isMaskRequired)
10018         NewLI = Builder.CreateMaskedLoad(
10019             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10020             PoisonValue::get(DataTy), "wide.masked.load");
10021       else
10022         NewLI =
10023             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10024 
      // Add metadata to the load, but set the vector value in State to the
      // reverse shuffle.
10026       State.ILV->addMetadata(NewLI, LI);
10027       if (Reverse)
10028         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10029     }
10030 
10031     State.set(getVPSingleValue(), NewLI, Part);
10032   }
10033 }
10034 
10035 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10036 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10037 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10038 // for predication.
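// For example (illustrative): a function carrying the optsize attribute always
// maps to CM_ScalarEpilogueNotAllowedOptSize via 1), regardless of hints, while
// absent overriding options an explicit predication hint (FK_Enabled) maps to
// CM_ScalarEpilogueNotNeededUsePredicate via 3).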
10039 static ScalarEpilogueLowering getScalarEpilogueLowering(
10040     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10041     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10042     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10043     LoopVectorizationLegality &LVL) {
10044   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10045   // don't look at hints or options, and don't request a scalar epilogue.
10046   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10047   // LoopAccessInfo (due to code dependency and not being able to reliably get
10048   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10049   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10050   // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // to the old way and vectorize with versioning when forced. See D81345.)
10052   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10053                                                       PGSOQueryType::IRPass) &&
10054                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10055     return CM_ScalarEpilogueNotAllowedOptSize;
10056 
10057   // 2) If set, obey the directives
10058   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10059     switch (PreferPredicateOverEpilogue) {
10060     case PreferPredicateTy::ScalarEpilogue:
10061       return CM_ScalarEpilogueAllowed;
10062     case PreferPredicateTy::PredicateElseScalarEpilogue:
10063       return CM_ScalarEpilogueNotNeededUsePredicate;
10064     case PreferPredicateTy::PredicateOrDontVectorize:
10065       return CM_ScalarEpilogueNotAllowedUsePredicate;
10066     };
10067   }
10068 
10069   // 3) If set, obey the hints
10070   switch (Hints.getPredicate()) {
10071   case LoopVectorizeHints::FK_Enabled:
10072     return CM_ScalarEpilogueNotNeededUsePredicate;
10073   case LoopVectorizeHints::FK_Disabled:
10074     return CM_ScalarEpilogueAllowed;
10075   };
10076 
  // 4) If the TTI hook indicates this is profitable, request predication.
10078   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10079                                        LVL.getLAI()))
10080     return CM_ScalarEpilogueNotNeededUsePredicate;
10081 
10082   return CM_ScalarEpilogueAllowed;
10083 }
10084 
10085 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If values have been set for this Def, return the one relevant for \p Part.
10087   if (hasVectorValue(Def, Part))
10088     return Data.PerPartOutput[Def][Part];
10089 
10090   if (!hasScalarValue(Def, {Part, 0})) {
10091     Value *IRV = Def->getLiveInIRValue();
10092     Value *B = ILV->getBroadcastInstrs(IRV);
10093     set(Def, B, Part);
10094     return B;
10095   }
10096 
10097   Value *ScalarValue = get(Def, {Part, 0});
10098   // If we aren't vectorizing, we can just copy the scalar map values over
10099   // to the vector map.
10100   if (VF.isScalar()) {
10101     set(Def, ScalarValue, Part);
10102     return ScalarValue;
10103   }
10104 
10105   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10106   bool IsUniform = RepR && RepR->isUniform();
10107 
10108   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10109   // Check if there is a scalar value for the selected lane.
10110   if (!hasScalarValue(Def, {Part, LastLane})) {
10111     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10112     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10113             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10114            "unexpected recipe found to be invariant");
10115     IsUniform = true;
10116     LastLane = 0;
10117   }
10118 
10119   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10120   // Set the insert point after the last scalarized instruction or after the
10121   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10122   // will directly follow the scalar definitions.
10123   auto OldIP = Builder.saveIP();
10124   auto NewIP =
10125       isa<PHINode>(LastInst)
10126           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10127           : std::next(BasicBlock::iterator(LastInst));
10128   Builder.SetInsertPoint(&*NewIP);
10129 
10130   // However, if we are vectorizing, we need to construct the vector values.
10131   // If the value is known to be uniform after vectorization, we can just
10132   // broadcast the scalar value corresponding to lane zero for each unroll
10133   // iteration. Otherwise, we construct the vector values using
10134   // insertelement instructions. Since the resulting vectors are stored in
10135   // State, we will only generate the insertelements once.
10136   Value *VectorValue = nullptr;
10137   if (IsUniform) {
10138     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10139     set(Def, VectorValue, Part);
10140   } else {
    // Initialize packing with insertelements to start from poison.
10142     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10143     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10144     set(Def, Undef, Part);
10145     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10146       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10147     VectorValue = get(Def, Part);
10148   }
10149   Builder.restoreIP(OldIP);
10150   return VectorValue;
10151 }
10152 
10153 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
10155 // VPlan-to-VPlan transformations from the very beginning without modifying the
10156 // input LLVM IR.
10157 static bool processLoopInVPlanNativePath(
10158     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10159     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10160     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10161     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10162     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10163     LoopVectorizationRequirements &Requirements) {
10164 
10165   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10166     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10167     return false;
10168   }
10169   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10170   Function *F = L->getHeader()->getParent();
10171   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10172 
10173   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10174       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10175 
10176   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10177                                 &Hints, IAI);
10178   // Use the planner for outer loop vectorization.
10179   // TODO: CM is not used at this point inside the planner. Turn CM into an
10180   // optional argument if we don't need it in the future.
10181   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10182                                Requirements, ORE);
10183 
10184   // Get user vectorization factor.
10185   ElementCount UserVF = Hints.getWidth();
10186 
10187   CM.collectElementTypesForWidening();
10188 
10189   // Plan how to best vectorize, return the best VF and its cost.
10190   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10191 
10192   // If we are stress testing VPlan builds, do not attempt to generate vector
10193   // code. Masked vector code generation support will follow soon.
10194   // Also, do not attempt to vectorize if no vector code will be produced.
10195   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10196     return false;
10197 
10198   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10199 
10200   {
10201     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10202                              F->getParent()->getDataLayout());
10203     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10204                            &CM, BFI, PSI, Checks);
10205     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10206                       << L->getHeader()->getParent()->getName() << "\"\n");
10207     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10208   }
10209 
10210   // Mark the loop as already vectorized to avoid vectorizing again.
10211   Hints.setAlreadyVectorized();
10212   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10213   return true;
10214 }
10215 
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with double precision, there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
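// For example (illustrative C): a[i] = (float)(2.0 * a[i]); the implicit fpext
// of a[i] to double forces the multiply to run on <N x double>, and the fpext
// found while walking up from the float store triggers the remark.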
10220 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10221   SmallVector<Instruction *, 4> Worklist;
10222   for (BasicBlock *BB : L->getBlocks()) {
10223     for (Instruction &Inst : *BB) {
10224       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10225         if (S->getValueOperand()->getType()->isFloatTy())
10226           Worklist.push_back(S);
10227       }
10228     }
10229   }
10230 
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
10233   SmallPtrSet<const Instruction *, 4> Visited;
10234   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10235   while (!Worklist.empty()) {
10236     auto *I = Worklist.pop_back_val();
10237     if (!L->contains(I))
10238       continue;
10239     if (!Visited.insert(I).second)
10240       continue;
10241 
10242     // Emit a remark if the floating point store required a floating
10243     // point conversion.
10244     // TODO: More work could be done to identify the root cause such as a
10245     // constant or a function return type and point the user to it.
10246     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10247       ORE->emit([&]() {
10248         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10249                                           I->getDebugLoc(), L->getHeader())
10250                << "floating point conversion changes vector width. "
10251                << "Mixed floating point precision requires an up/down "
10252                << "cast that will negatively impact performance.";
10253       });
10254 
10255     for (Use &Op : I->operands())
10256       if (auto *OpI = dyn_cast<Instruction>(Op))
10257         Worklist.push_back(OpI);
10258   }
10259 }
10260 
10261 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10262     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10263                                !EnableLoopInterleaving),
10264       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10265                               !EnableLoopVectorization) {}
10266 
10267 bool LoopVectorizePass::processLoop(Loop *L) {
10268   assert((EnableVPlanNativePath || L->isInnermost()) &&
10269          "VPlan-native path is not enabled. Only process inner loops.");
10270 
10271 #ifndef NDEBUG
10272   const std::string DebugLocStr = getDebugLocString(L);
10273 #endif /* NDEBUG */
10274 
10275   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10276                     << L->getHeader()->getParent()->getName() << "' from "
10277                     << DebugLocStr << "\n");
10278 
10279   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10280 
10281   LLVM_DEBUG(
10282       dbgs() << "LV: Loop hints:"
10283              << " force="
10284              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10285                      ? "disabled"
10286                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10287                             ? "enabled"
10288                             : "?"))
10289              << " width=" << Hints.getWidth()
10290              << " interleave=" << Hints.getInterleave() << "\n");
10291 
10292   // Function containing loop
10293   Function *F = L->getHeader()->getParent();
10294 
10295   // Looking at the diagnostic output is the only way to determine if a loop
10296   // was vectorized (other than looking at the IR or machine code), so it
10297   // is important to generate an optimization remark for each loop. Most of
10298   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are used for
  // the less verbose reporting of vectorized loops and of unvectorized loops
  // that may benefit from vectorization, respectively.
10302 
10303   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10304     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10305     return false;
10306   }
10307 
10308   PredicatedScalarEvolution PSE(*SE, *L);
10309 
10310   // Check if it is legal to vectorize the loop.
10311   LoopVectorizationRequirements Requirements;
10312   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10313                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10314   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10315     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10316     Hints.emitRemarkWithHints();
10317     return false;
10318   }
10319 
10320   // Check the function attributes and profiles to find out if this function
10321   // should be optimized for size.
10322   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10323       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10324 
10325   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10326   // here. They may require CFG and instruction level transformations before
10327   // even evaluating whether vectorization is profitable. Since we cannot modify
10328   // the incoming IR, we need to build VPlan upfront in the vectorization
10329   // pipeline.
10330   if (!L->isInnermost())
10331     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10332                                         ORE, BFI, PSI, Hints, Requirements);
10333 
10334   assert(L->isInnermost() && "Inner loop expected.");
10335 
10336   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10337   // count by optimizing for size, to minimize overheads.
10338   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10339   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10340     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10341                       << "This loop is worth vectorizing only if no scalar "
10342                       << "iteration overheads are incurred.");
10343     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10344       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10345     else {
10346       LLVM_DEBUG(dbgs() << "\n");
10347       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10348     }
10349   }
10350 
10351   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
10353   // an integer loop and the vector instructions selected are purely integer
10354   // vector instructions?
10355   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10356     reportVectorizationFailure(
10357         "Can't vectorize when the NoImplicitFloat attribute is used",
10358         "loop not vectorized due to NoImplicitFloat attribute",
10359         "NoImplicitFloat", ORE, L);
10360     Hints.emitRemarkWithHints();
10361     return false;
10362   }
10363 
10364   // Check if the target supports potentially unsafe FP vectorization.
10365   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10366   // for the target we're vectorizing for, to make sure none of the
10367   // additional fp-math flags can help.
10368   if (Hints.isPotentiallyUnsafe() &&
10369       TTI->isFPVectorizationPotentiallyUnsafe()) {
10370     reportVectorizationFailure(
10371         "Potentially unsafe FP op prevents vectorization",
10372         "loop not vectorized due to unsafe FP support.",
10373         "UnsafeFP", ORE, L);
10374     Hints.emitRemarkWithHints();
10375     return false;
10376   }
10377 
10378   bool AllowOrderedReductions;
10379   // If the flag is set, use that instead and override the TTI behaviour.
10380   if (ForceOrderedReductions.getNumOccurrences() > 0)
10381     AllowOrderedReductions = ForceOrderedReductions;
10382   else
10383     AllowOrderedReductions = TTI->enableOrderedReductions();
10384   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10385     ORE->emit([&]() {
10386       auto *ExactFPMathInst = Requirements.getExactFPInst();
10387       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10388                                                  ExactFPMathInst->getDebugLoc(),
10389                                                  ExactFPMathInst->getParent())
10390              << "loop not vectorized: cannot prove it is safe to reorder "
10391                 "floating-point operations";
10392     });
10393     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10394                          "reorder floating-point operations\n");
10395     Hints.emitRemarkWithHints();
10396     return false;
10397   }
10398 
10399   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10400   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10401 
10402   // If an override option has been passed in for interleaved accesses, use it.
10403   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10404     UseInterleaved = EnableInterleavedMemAccesses;
10405 
10406   // Analyze interleaved memory accesses.
10407   if (UseInterleaved) {
10408     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10409   }
10410 
10411   // Use the cost model.
10412   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10413                                 F, &Hints, IAI);
10414   CM.collectValuesToIgnore();
10415   CM.collectElementTypesForWidening();
10416 
10417   // Use the planner for vectorization.
10418   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10419                                Requirements, ORE);
10420 
10421   // Get user vectorization factor and interleave count.
10422   ElementCount UserVF = Hints.getWidth();
10423   unsigned UserIC = Hints.getInterleave();
10424 
10425   // Plan how to best vectorize, return the best VF and its cost.
10426   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10427 
10428   VectorizationFactor VF = VectorizationFactor::Disabled();
10429   unsigned IC = 1;
10430 
10431   if (MaybeVF) {
10432     if (LVP.requiresTooManyRuntimeChecks()) {
10433       ORE->emit([&]() {
10434         return OptimizationRemarkAnalysisAliasing(
10435                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10436                    L->getHeader())
10437                << "loop not vectorized: cannot prove it is safe to reorder "
10438                   "memory operations";
10439       });
10440       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10441       Hints.emitRemarkWithHints();
10442       return false;
10443     }
10444     VF = *MaybeVF;
10445     // Select the interleave count.
10446     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10447   }
10448 
10449   // Identify the diagnostic messages that should be produced.
10450   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10451   bool VectorizeLoop = true, InterleaveLoop = true;
10452   if (VF.Width.isScalar()) {
10453     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10454     VecDiagMsg = std::make_pair(
10455         "VectorizationNotBeneficial",
10456         "the cost-model indicates that vectorization is not beneficial");
10457     VectorizeLoop = false;
10458   }
10459 
10460   if (!MaybeVF && UserIC > 1) {
10461     // Tell the user interleaving was avoided up-front, despite being explicitly
10462     // requested.
10463     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10464                          "interleaving should be avoided up front\n");
10465     IntDiagMsg = std::make_pair(
10466         "InterleavingAvoided",
10467         "Ignoring UserIC, because interleaving was avoided up front");
10468     InterleaveLoop = false;
10469   } else if (IC == 1 && UserIC <= 1) {
10470     // Tell the user interleaving is not beneficial.
10471     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10472     IntDiagMsg = std::make_pair(
10473         "InterleavingNotBeneficial",
10474         "the cost-model indicates that interleaving is not beneficial");
10475     InterleaveLoop = false;
10476     if (UserIC == 1) {
10477       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10478       IntDiagMsg.second +=
10479           " and is explicitly disabled or interleave count is set to 1";
10480     }
10481   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
10483     LLVM_DEBUG(
10484         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10485     IntDiagMsg = std::make_pair(
10486         "InterleavingBeneficialButDisabled",
10487         "the cost-model indicates that interleaving is beneficial "
10488         "but is explicitly disabled or interleave count is set to 1");
10489     InterleaveLoop = false;
10490   }
10491 
10492   // Override IC if user provided an interleave count.
10493   IC = UserIC > 0 ? UserIC : IC;
10494 
10495   // Emit diagnostic messages, if any.
10496   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10497   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10499     ORE->emit([&]() {
10500       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10501                                       L->getStartLoc(), L->getHeader())
10502              << VecDiagMsg.second;
10503     });
10504     ORE->emit([&]() {
10505       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10506                                       L->getStartLoc(), L->getHeader())
10507              << IntDiagMsg.second;
10508     });
10509     return false;
10510   } else if (!VectorizeLoop && InterleaveLoop) {
10511     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10512     ORE->emit([&]() {
10513       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10514                                         L->getStartLoc(), L->getHeader())
10515              << VecDiagMsg.second;
10516     });
10517   } else if (VectorizeLoop && !InterleaveLoop) {
10518     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10519                       << ") in " << DebugLocStr << '\n');
10520     ORE->emit([&]() {
10521       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10522                                         L->getStartLoc(), L->getHeader())
10523              << IntDiagMsg.second;
10524     });
10525   } else if (VectorizeLoop && InterleaveLoop) {
10526     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10527                       << ") in " << DebugLocStr << '\n');
10528     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10529   }
10530 
10531   bool DisableRuntimeUnroll = false;
10532   MDNode *OrigLoopID = L->getLoopID();
10533   {
10534     // Optimistically generate runtime checks. Drop them if they turn out to not
10535     // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10537     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10538                              F->getParent()->getDataLayout());
10539     if (!VF.Width.isScalar() || IC > 1)
10540       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC);
10541 
10542     using namespace ore;
10543     if (!VectorizeLoop) {
10544       assert(IC > 1 && "interleave count should not be 1 or 0");
10545       // If we decided that it is not legal to vectorize the loop, then
10546       // interleave it.
10547       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10548                                  &CM, BFI, PSI, Checks);
10549 
10550       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10551       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10552 
10553       ORE->emit([&]() {
10554         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10555                                   L->getHeader())
10556                << "interleaved loop (interleaved count: "
10557                << NV("InterleaveCount", IC) << ")";
10558       });
10559     } else {
10560       // If we decided that it is *legal* to vectorize the loop, then do it.
10561 
10562       // Consider vectorizing the epilogue too if it's profitable.
10563       VectorizationFactor EpilogueVF =
10564           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10565       if (EpilogueVF.Width.isVector()) {
10566 
10567         // The first pass vectorizes the main loop and creates a scalar epilogue
10568         // to be vectorized by executing the plan (potentially with a different
10569         // factor) again shortly afterwards.
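        // For example (illustrative): with a main VF of 8, IC of 2 and an
        // epilogue VF of 4, EPI is built as (8, 2, 4, 1): the main loop is
        // emitted at VF=8/UF=2 and the epilogue loop at VF=4/UF=1.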
10570         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10571         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10572                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10573 
10574         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10575         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10576                         DT);
10577         ++LoopsVectorized;
10578 
10579         // Second pass vectorizes the epilogue and adjusts the control flow
10580         // edges from the first pass.
10581         EPI.MainLoopVF = EPI.EpilogueVF;
10582         EPI.MainLoopUF = EPI.EpilogueUF;
10583         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10584                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10585                                                  Checks);
10586 
10587         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10588         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10589         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10590         Header->setName("vec.epilog.vector.body");
10591 
10592         // Ensure that the start values for any VPReductionPHIRecipes are
10593         // updated before vectorising the epilogue loop.
10594         for (VPRecipeBase &R : Header->phis()) {
10595           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10596             if (auto *Resume = MainILV.getReductionResumeValue(
10597                     ReductionPhi->getRecurrenceDescriptor())) {
10598               VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
10599               ReductionPhi->setOperand(0, StartVal);
10600             }
10601           }
10602         }
10603 
10604         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10605                         DT);
10606         ++LoopsEpilogueVectorized;
10607 
10608         if (!MainILV.areSafetyChecksAdded())
10609           DisableRuntimeUnroll = true;
10610       } else {
10611         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10612                                &LVL, &CM, BFI, PSI, Checks);
10613 
10614         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10615         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10616         ++LoopsVectorized;
10617 
10618         // Add metadata to disable runtime unrolling a scalar loop when there
10619         // are no runtime checks about strides and memory. A scalar loop that is
10620         // rarely used is not worth unrolling.
10621         if (!LB.areSafetyChecksAdded())
10622           DisableRuntimeUnroll = true;
10623       }
10624       // Report the vectorization decision.
10625       ORE->emit([&]() {
10626         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10627                                   L->getHeader())
10628                << "vectorized loop (vectorization width: "
10629                << NV("VectorizationFactor", VF.Width)
10630                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10631       });
10632     }
10633 
10634     if (ORE->allowExtraAnalysis(LV_NAME))
10635       checkMixedPrecision(L, ORE);
10636   }
10637 
10638   Optional<MDNode *> RemainderLoopID =
10639       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10640                                       LLVMLoopVectorizeFollowupEpilogue});
10641   if (RemainderLoopID.hasValue()) {
10642     L->setLoopID(RemainderLoopID.getValue());
10643   } else {
10644     if (DisableRuntimeUnroll)
10645       AddRuntimeUnrollDisableMetaData(L);
10646 
10647     // Mark the loop as already vectorized to avoid vectorizing again.
10648     Hints.setAlreadyVectorized();
10649   }
10650 
10651   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10652   return true;
10653 }
10654 
10655 LoopVectorizeResult LoopVectorizePass::runImpl(
10656     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10657     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10658     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10659     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10660     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10661   SE = &SE_;
10662   LI = &LI_;
10663   TTI = &TTI_;
10664   DT = &DT_;
10665   BFI = &BFI_;
10666   TLI = TLI_;
10667   AA = &AA_;
10668   AC = &AC_;
10669   GetLAA = &GetLAA_;
10670   DB = &DB_;
10671   ORE = &ORE_;
10672   PSI = PSI_;
10673 
10674   // Don't attempt if
10675   // 1. the target claims to have no vector registers, and
10676   // 2. interleaving won't help ILP.
10677   //
10678   // The second condition is necessary because, even if the target has no
10679   // vector registers, loop vectorization may still enable scalar
10680   // interleaving.
10681   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10682       TTI->getMaxInterleaveFactor(1) < 2)
10683     return LoopVectorizeResult(false, false);
10684 
10685   bool Changed = false, CFGChanged = false;
10686 
10687   // The vectorizer requires loops to be in simplified form.
10688   // Since simplification may add new inner loops, it has to run before the
10689   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10691   // vectorized.
10692   for (auto &L : *LI)
10693     Changed |= CFGChanged |=
10694         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10695 
10696   // Build up a worklist of inner-loops to vectorize. This is necessary as
10697   // the act of vectorizing or partially unrolling a loop creates new loops
10698   // and can invalidate iterators across the loops.
10699   SmallVector<Loop *, 8> Worklist;
10700 
10701   for (Loop *L : *LI)
10702     collectSupportedLoops(*L, LI, ORE, Worklist);
10703 
10704   LoopsAnalyzed += Worklist.size();
10705 
10706   // Now walk the identified inner loops.
10707   while (!Worklist.empty()) {
10708     Loop *L = Worklist.pop_back_val();
10709 
10710     // For the inner loops we actually process, form LCSSA to simplify the
10711     // transform.
10712     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10713 
10714     Changed |= CFGChanged |= processLoop(L);
10715   }
10716 
10717   // Process each loop nest in the function.
10718   return LoopVectorizeResult(Changed, CFGChanged);
10719 }
10720 
10721 PreservedAnalyses LoopVectorizePass::run(Function &F,
10722                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // If there are no loops in the function, return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10774 }
10775 
10776 void LoopVectorizePass::printPipeline(
10777     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10778   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10779       OS, MapClassName2PassName);
10780 
10781   OS << "<";
10782   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10783   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10784   OS << ">";
10785 }
10786