1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to determine the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
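//
// For illustration only (a conceptual C-level sketch, not the IR this pass
// actually emits), with a vector width of 4 a loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is rewritten to process four elements per wide iteration, with any
// remaining iterations handled by a scalar epilogue loop:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     for (int j = 0; j < 4; ++j) // becomes a single SIMD add
//       a[i + j] = b[i + j] + 1;
//   for (; i < n; ++i)            // scalar epilogue
//     a[i] = b[i] + 1;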
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SmallPtrSet.h"
72 #include "llvm/ADT/SmallSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
90 #include "llvm/Analysis/ProfileSummaryInfo.h"
91 #include "llvm/Analysis/ScalarEvolution.h"
92 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
93 #include "llvm/Analysis/TargetLibraryInfo.h"
94 #include "llvm/Analysis/TargetTransformInfo.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfoMetadata.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/Metadata.h"
115 #include "llvm/IR/Module.h"
116 #include "llvm/IR/Operator.h"
117 #include "llvm/IR/PatternMatch.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/InitializePasses.h"
125 #include "llvm/Pass.h"
126 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/CommandLine.h"
128 #include "llvm/Support/Compiler.h"
129 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/ErrorHandling.h"
131 #include "llvm/Support/InstructionCost.h"
132 #include "llvm/Support/MathExtras.h"
133 #include "llvm/Support/raw_ostream.h"
134 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
135 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <map>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
200     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks with a "
202              "vectorize(enable) pragma."));
203 
204 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
205 // and that predication is preferred; the enum below lists the options. I.e., the
206 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
207 // and predicate the instructions accordingly. If tail-folding fails, there are
208 // different fallback strategies depending on these values:
209 namespace PreferPredicateTy {
210   enum Option {
211     ScalarEpilogue = 0,
212     PredicateElseScalarEpilogue,
213     PredicateOrDontVectorize
214   };
215 } // namespace PreferPredicateTy
216 
217 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
218     "prefer-predicate-over-epilogue",
219     cl::init(PreferPredicateTy::ScalarEpilogue),
220     cl::Hidden,
221     cl::desc("Tail-folding and predication preferences over creating a scalar "
222              "epilogue loop."),
223     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
224                          "scalar-epilogue",
225                          "Don't tail-predicate loops, create scalar epilogue"),
226               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
227                          "predicate-else-scalar-epilogue",
228                          "prefer tail-folding, create scalar epilogue if tail "
229                          "folding fails."),
230               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
231                          "predicate-dont-vectorize",
232                          "prefers tail-folding, don't attempt vectorization if "
233                          "tail-folding fails.")));
234 
235 static cl::opt<bool> MaximizeBandwidth(
236     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
237     cl::desc("Maximize bandwidth when selecting the vectorization factor, which "
238              "will be determined by the smallest type in the loop."));
239 
240 static cl::opt<bool> EnableInterleavedMemAccesses(
241     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
242     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
243 
244 /// An interleave-group may need masking if it resides in a block that needs
245 /// predication, or in order to mask away gaps.
246 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
247     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
248     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
249 
250 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
251     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
252     cl::desc("We don't interleave loops with an estimated constant trip count "
253              "below this number"));
254 
255 static cl::opt<unsigned> ForceTargetNumScalarRegs(
256     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of scalar registers."));
258 
259 static cl::opt<unsigned> ForceTargetNumVectorRegs(
260     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's number of vector registers."));
262 
263 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
264     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
265     cl::desc("A flag that overrides the target's max interleave factor for "
266              "scalar loops."));
267 
268 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
269     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
270     cl::desc("A flag that overrides the target's max interleave factor for "
271              "vectorized loops."));
272 
273 static cl::opt<unsigned> ForceTargetInstructionCost(
274     "force-target-instruction-cost", cl::init(0), cl::Hidden,
275     cl::desc("A flag that overrides the target's expected cost for "
276              "an instruction to a single constant value. Mostly "
277              "useful for getting consistent testing."));
278 
279 static cl::opt<bool> ForceTargetSupportsScalableVectors(
280     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
281     cl::desc(
282         "Pretend that scalable vectors are supported, even if the target does "
283         "not support them. This flag should only be used for testing."));
284 
285 static cl::opt<unsigned> SmallLoopCost(
286     "small-loop-cost", cl::init(20), cl::Hidden,
287     cl::desc(
288         "The cost of a loop that is considered 'small' by the interleaver."));
289 
290 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
291     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
292     cl::desc("Enable the use of the block frequency analysis to access PGO "
293              "heuristics minimizing code growth in cold regions and being more "
294              "aggressive in hot regions."));
295 
296 // Runtime interleave loops for load/store throughput.
297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
298     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
299     cl::desc(
300         "Enable runtime interleaving until load/store ports are saturated"));
301 
302 /// Interleave small loops with scalar reductions.
303 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
304     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
305     cl::desc("Enable interleaving for loops with small iteration counts that "
306              "contain scalar reductions to expose ILP."));
307 
308 /// The number of stores in a loop that are allowed to need predication.
309 static cl::opt<unsigned> NumberOfStoresToPredicate(
310     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
311     cl::desc("Max number of stores to be predicated behind an if."));
312 
313 static cl::opt<bool> EnableIndVarRegisterHeur(
314     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
315     cl::desc("Count the induction variable only once when interleaving"));
316 
317 static cl::opt<bool> EnableCondStoresVectorization(
318     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
319     cl::desc("Enable if-predication of stores during vectorization."));
320 
321 static cl::opt<unsigned> MaxNestedScalarReductionIC(
322     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
323     cl::desc("The maximum interleave count to use when interleaving a scalar "
324              "reduction in a nested loop."));
325 
326 static cl::opt<bool>
327     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
328                            cl::Hidden,
329                            cl::desc("Prefer in-loop vector reductions, "
330                                     "overriding the target's preference."));
331 
332 static cl::opt<bool> ForceOrderedReductions(
333     "force-ordered-reductions", cl::init(false), cl::Hidden,
334     cl::desc("Enable the vectorization of loops with in-order (strict) "
335              "FP reductions"));
336 
337 static cl::opt<bool> PreferPredicatedReductionSelect(
338     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
339     cl::desc(
340         "Prefer predicating a reduction operation over an after loop select."));
341 
342 cl::opt<bool> EnableVPlanNativePath(
343     "enable-vplan-native-path", cl::init(false), cl::Hidden,
344     cl::desc("Enable VPlan-native vectorization path with "
345              "support for outer loop vectorization."));
346 
347 // This flag enables the stress testing of the VPlan H-CFG construction in the
348 // VPlan-native vectorization path. It must be used in conjunction with
349 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
350 // verification of the H-CFGs built.
351 static cl::opt<bool> VPlanBuildStressTest(
352     "vplan-build-stress-test", cl::init(false), cl::Hidden,
353     cl::desc(
354         "Build VPlan for every supported loop nest in the function and bail "
355         "out right after the build (stress test the VPlan H-CFG construction "
356         "in the VPlan-native vectorization path)."));
357 
358 cl::opt<bool> llvm::EnableLoopInterleaving(
359     "interleave-loops", cl::init(true), cl::Hidden,
360     cl::desc("Enable loop interleaving in Loop vectorization passes"));
361 cl::opt<bool> llvm::EnableLoopVectorization(
362     "vectorize-loops", cl::init(true), cl::Hidden,
363     cl::desc("Run the Loop vectorization passes"));
364 
365 cl::opt<bool> PrintVPlansInDotFormat(
366     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
367     cl::desc("Use dot format instead of plain text when dumping VPlans"));
368 
369 /// A helper function that returns true if the given type is irregular. The
370 /// type is irregular if its allocated size in bits doesn't equal its type
371 /// size in bits, i.e. when padding exists between array elements of that type.
372 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
373   // Determine if an array of N elements of type Ty is "bitcast compatible"
374   // with a <N x Ty> vector.
375   // This is only true if there is no padding between the array elements.
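  // For example (illustrative): an i1 has a type size of 1 bit but an
  // allocation size of 8 bits, so an [N x i1] array does not have the same
  // layout as a tightly packed <N x i1> and i1 is reported as irregular here.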
376   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
377 }
378 
379 /// A helper function that returns the reciprocal of the block probability of
380 /// predicated blocks. If we return X, we are assuming the predicated block
381 /// will execute once for every X iterations of the loop header.
382 ///
383 /// TODO: We should use actual block probability here, if available. Currently,
384 ///       we always assume predicated blocks have a 50% chance of executing.
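/// For example, with the current 50% assumption this returns 2, so the cost
/// model charges a predicated block roughly half of its full cost for each
/// iteration of the loop header.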
385 static unsigned getReciprocalPredBlockProb() { return 2; }
386 
387 /// A helper function that returns an integer or floating-point constant with
388 /// value C.
389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391                            : ConstantFP::get(Ty, C);
392 }
393 
394 /// Returns "best known" trip count for the specified loop \p L as defined by
395 /// the following procedure:
396 ///   1) Returns exact trip count if it is known.
397 ///   2) Returns expected trip count according to profile data if any.
398 ///   3) Returns upper bound estimate if it is known.
399 ///   4) Returns None if all of the above failed.
400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401   // Check if exact trip count is known.
402   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403     return ExpectedTC;
404 
405   // Check if there is an expected trip count available from profile data.
406   if (LoopVectorizeWithBlockFrequency)
407     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408       return EstimatedTC;
409 
410   // Check if upper bound estimate is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412     return ExpectedTC;
413 
414   return None;
415 }
416 
417 // Forward declare GeneratedRTChecks.
418 class GeneratedRTChecks;
419 
420 namespace llvm {
421 
422 AnalysisKey ShouldRunExtraVectorPasses::Key;
423 
424 /// InnerLoopVectorizer vectorizes loops which contain only one basic
425 /// block to a specified vectorization factor (VF).
426 /// This class performs the widening of scalars into vectors, or multiple
427 /// scalars. This class also implements the following features:
428 /// * It inserts an epilogue loop for handling loops that don't have iteration
429 ///   counts that are known to be a multiple of the vectorization factor.
430 /// * It handles the code generation for reduction variables.
431 /// * Scalarization (implementation using scalars) of un-vectorizable
432 ///   instructions.
433 /// InnerLoopVectorizer does not perform any vectorization-legality
434 /// checks, and relies on the caller to check for the different legality
435 /// aspects. The InnerLoopVectorizer relies on the
436 /// LoopVectorizationLegality class to provide information about the induction
437 /// and reduction variables that were found in the loop.
438 class InnerLoopVectorizer {
439 public:
440   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
441                       LoopInfo *LI, DominatorTree *DT,
442                       const TargetLibraryInfo *TLI,
443                       const TargetTransformInfo *TTI, AssumptionCache *AC,
444                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
445                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
446                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
447                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
448       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
449         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
450         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
451         PSI(PSI), RTChecks(RTChecks) {
452     // Query this against the original loop and save it here because the profile
453     // of the original loop header may change as the transformation happens.
454     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
455         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
456   }
457 
458   virtual ~InnerLoopVectorizer() = default;
459 
460   /// Create a new empty loop that will contain vectorized instructions later
461   /// on, while the old loop will be used as the scalar remainder. Control flow
462   /// is generated around the vectorized (and scalar epilogue) loops consisting
463   /// of various checks and bypasses. Return the pre-header block of the new
464   /// loop and the start value for the canonical induction, if it is != 0. The
465   /// latter is the case when vectorizing the epilogue loop. In the case of
466 /// epilogue vectorization, this function is overridden to handle the more
467   /// complex control flow around the loops.
468   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
469 
470   /// Widen a single call instruction within the innermost loop.
471   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
472                             VPTransformState &State);
473 
474   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
475   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
476 
477   // Return true if any runtime check is added.
478   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
479 
480   /// A type for vectorized values in the new loop. Each value from the
481   /// original loop, when vectorized, is represented by UF vector values in the
482   /// new unrolled loop, where UF is the unroll factor.
483   using VectorParts = SmallVector<Value *, 2>;
484 
485   /// Vectorize a single vector PHINode in a block in the VPlan-native path
486   /// only.
487   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
488                            VPTransformState &State);
489 
490   /// A helper function to scalarize a single Instruction in the innermost loop.
491   /// Generates a sequence of scalar instances for each lane between \p MinLane
492   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
493   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
494   /// Instr's operands.
495   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
496                             const VPIteration &Instance, bool IfPredicateInstr,
497                             VPTransformState &State);
498 
499   /// Construct the vector value of a scalarized value \p V one lane at a time.
500   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
501                                  VPTransformState &State);
502 
503   /// Try to vectorize interleaved access group \p Group with the base address
504   /// given in \p Addr, optionally masking the vector operations if \p
505   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
506   /// values in the vectorized loop.
507   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
508                                 ArrayRef<VPValue *> VPDefs,
509                                 VPTransformState &State, VPValue *Addr,
510                                 ArrayRef<VPValue *> StoredValues,
511                                 VPValue *BlockInMask = nullptr);
512 
513   /// Set the debug location in the builder \p CustomBuilder using the debug
514   /// location in \p V. If \p CustomBuilder is None, the class member Builder is used.
515   void setDebugLocFromInst(const Value *V,
516                            Optional<IRBuilderBase *> CustomBuilder = None);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
520 
521   /// Returns true if the reordering of FP operations is not allowed, but we are
522   /// able to vectorize with strict in-order reductions for the given RdxDesc.
523   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
524 
525   /// Create a broadcast instruction. This method generates a broadcast
526   /// instruction (shuffle) for loop invariant values and for the induction
527   /// value. If this is the induction variable then we extend it to N, N+1, ...
528   /// this is needed because each iteration in the loop corresponds to a SIMD
529   /// element.
530   virtual Value *getBroadcastInstrs(Value *V);
531 
532   /// Add metadata from one instruction to another.
533   ///
534   /// This includes both the original MDs from \p From and additional ones (\see
535   /// addNewMetadata).  Use this for *newly created* instructions in the vector
536   /// loop.
537   void addMetadata(Instruction *To, Instruction *From);
538 
539   /// Similar to the previous function but it adds the metadata to a
540   /// vector of instructions.
541   void addMetadata(ArrayRef<Value *> To, Instruction *From);
542 
543   // Returns the resume value (bc.merge.rdx) for a reduction as
544   // generated by fixReduction.
545   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
546 
547 protected:
548   friend class LoopVectorizationPlanner;
549 
550   /// A small list of PHINodes.
551   using PhiVector = SmallVector<PHINode *, 4>;
552 
553   /// A type for scalarized values in the new loop. Each value from the
554   /// original loop, when scalarized, is represented by UF x VF scalar values
555   /// in the new unrolled loop, where UF is the unroll factor and VF is the
556   /// vectorization factor.
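  /// For example, with UF = 2 and VF = 4, each scalarized value is held as
  /// two parts of four scalar Values each (eight scalars in total).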
557   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
558 
559   /// Set up the values of the IVs correctly when exiting the vector loop.
560   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
561                     Value *VectorTripCount, Value *EndValue,
562                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
563                     VPlan &Plan);
564 
565   /// Handle all cross-iteration phis in the header.
566   void fixCrossIterationPHIs(VPTransformState &State);
567 
568   /// Create the exit value of first order recurrences in the middle block and
569   /// update their users.
570   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
571                                VPTransformState &State);
572 
573   /// Create code for the loop exit value of the reduction.
574   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
575 
576   /// Clear NSW/NUW flags from reduction instructions if necessary.
577   void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
578                                VPTransformState &State);
579 
580   /// Iteratively sink the scalarized operands of a predicated instruction into
581   /// the block that was created for it.
582   void sinkScalarOperands(Instruction *PredInst);
583 
584   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
585   /// represented as.
586   void truncateToMinimalBitwidths(VPTransformState &State);
587 
588   /// Returns (and creates if needed) the original loop trip count.
589   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
590 
591   /// Returns (and creates if needed) the trip count of the widened loop.
592   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
593 
594   /// Returns a bitcasted value to the requested vector type.
595   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
596   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
597                                 const DataLayout &DL);
598 
599   /// Emit a bypass check to see if the vector trip count is zero, including if
600   /// it overflows.
601   void emitIterationCountCheck(BasicBlock *Bypass);
602 
603   /// Emit a bypass check to see if all of the SCEV assumptions we've
604   /// had to make are correct. Returns the block containing the checks or
605   /// nullptr if no checks have been added.
606   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
607 
608   /// Emit bypass checks to check any memory assumptions we may have made.
609   /// Returns the block containing the checks or nullptr if no checks have been
610   /// added.
611   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
612 
613   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
614   /// vector loop preheader, middle block and scalar preheader.
615   void createVectorLoopSkeleton(StringRef Prefix);
616 
617   /// Create new phi nodes for the induction variables to resume the iteration
618   /// count in the scalar epilogue, from where the vectorized loop left off.
619   /// In cases where the loop skeleton is more complicated (e.g. epilogue
620   /// vectorization) and the resume values can come from an additional bypass
621   /// block, the \p AdditionalBypass pair provides information about the bypass
622   /// block and the end value on the edge from bypass to this loop.
623   void createInductionResumeValues(
624       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
625 
626   /// Complete the loop skeleton by adding debug MDs, creating appropriate
627   /// conditional branches in the middle block, preparing the builder and
628   /// running the verifier. Return the preheader of the completed vector loop.
629   BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
630 
631   /// Add additional metadata to \p To that was not present on \p Orig.
632   ///
633   /// Currently this is used to add the noalias annotations based on the
634   /// inserted memchecks.  Use this for instructions that are *cloned* into the
635   /// vector loop.
636   void addNewMetadata(Instruction *To, const Instruction *Orig);
637 
638   /// Collect poison-generating recipes that may generate a poison value that is
639   /// used after vectorization, even when their operands are not poison. Those
640   /// recipes meet the following conditions:
641   ///  * Contribute to the address computation of a recipe generating a widen
642   ///    memory load/store (VPWidenMemoryInstructionRecipe or
643   ///    VPInterleaveRecipe).
644   ///  * Such a widen memory load/store has at least one underlying Instruction
645   ///    that is in a basic block that needs predication and after vectorization
646   ///    the generated instruction won't be predicated.
647   void collectPoisonGeneratingRecipes(VPTransformState &State);
648 
649   /// Allow subclasses to override and print debug traces before/after vplan
650   /// execution, when trace information is requested.
651   virtual void printDebugTracesAtStart() {}
652   virtual void printDebugTracesAtEnd() {}
653 
654   /// The original loop.
655   Loop *OrigLoop;
656 
657   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
658   /// dynamic knowledge to simplify SCEV expressions and converts them to a
659   /// more usable form.
660   PredicatedScalarEvolution &PSE;
661 
662   /// Loop Info.
663   LoopInfo *LI;
664 
665   /// Dominator Tree.
666   DominatorTree *DT;
667 
668   /// Alias Analysis.
669   AAResults *AA;
670 
671   /// Target Library Info.
672   const TargetLibraryInfo *TLI;
673 
674   /// Target Transform Info.
675   const TargetTransformInfo *TTI;
676 
677   /// Assumption Cache.
678   AssumptionCache *AC;
679 
680   /// Interface to emit optimization remarks.
681   OptimizationRemarkEmitter *ORE;
682 
683   /// LoopVersioning.  It's only set up (non-null) if memchecks were
684   /// used.
685   ///
686   /// This is currently only used to add no-alias metadata based on the
687   /// memchecks.  The actual versioning is performed manually.
688   std::unique_ptr<LoopVersioning> LVer;
689 
690   /// The vectorization SIMD factor to use. Each vector will have this many
691   /// vector elements.
692   ElementCount VF;
693 
694   /// The vectorization unroll factor to use. Each scalar is vectorized to this
695   /// many different vector instructions.
696   unsigned UF;
697 
698   /// The builder that we use
699   IRBuilder<> Builder;
700 
701   // --- Vectorization state ---
702 
703   /// The vector-loop preheader.
704   BasicBlock *LoopVectorPreHeader;
705 
706   /// The scalar-loop preheader.
707   BasicBlock *LoopScalarPreHeader;
708 
709   /// Middle Block between the vector and the scalar.
710   BasicBlock *LoopMiddleBlock;
711 
712   /// The unique ExitBlock of the scalar loop if one exists.  Note that
713   /// there can be multiple exiting edges reaching this block.
714   BasicBlock *LoopExitBlock;
715 
716   /// The scalar loop body.
717   BasicBlock *LoopScalarBody;
718 
719   /// A list of all bypass blocks. The first block is the entry of the loop.
720   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
721 
722   /// Store instructions that were predicated.
723   SmallVector<Instruction *, 4> PredicatedInstructions;
724 
725   /// Trip count of the original loop.
726   Value *TripCount = nullptr;
727 
728   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
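  /// For example (with a fixed VF), TripCount = 100, VF = 4 and UF = 2 give
  /// 100 - 100 % 8 = 96; the remaining 4 iterations run in the scalar
  /// remainder loop.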
729   Value *VectorTripCount = nullptr;
730 
731   /// The legality analysis.
732   LoopVectorizationLegality *Legal;
733 
734   /// The profitability analysis.
735   LoopVectorizationCostModel *Cost;
736 
737   // Record whether runtime checks are added.
738   bool AddedSafetyChecks = false;
739 
740   // Holds the end values for each induction variable. We save the end values
741   // so we can later fix up the external users of the induction variables.
742   DenseMap<PHINode *, Value *> IVEndValues;
743 
744   /// BFI and PSI are used to check for profile-guided size optimizations.
745   BlockFrequencyInfo *BFI;
746   ProfileSummaryInfo *PSI;
747 
748   // Whether this loop should be optimized for size based on profile-guided
749   // size optimizations.
750   bool OptForSizeBasedOnProfile;
751 
752   /// Structure to hold information about generated runtime checks, responsible
753   /// for cleaning the checks, if vectorization turns out unprofitable.
754   GeneratedRTChecks &RTChecks;
755 
756   // Holds the resume values for reductions in the loops, used to set the
757   // correct start value of reduction PHIs when vectorizing the epilogue.
758   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
759       ReductionResumeValues;
760 };
761 
762 class InnerLoopUnroller : public InnerLoopVectorizer {
763 public:
764   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
765                     LoopInfo *LI, DominatorTree *DT,
766                     const TargetLibraryInfo *TLI,
767                     const TargetTransformInfo *TTI, AssumptionCache *AC,
768                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
769                     LoopVectorizationLegality *LVL,
770                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
771                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
772       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
773                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
774                             BFI, PSI, Check) {}
775 
776 private:
777   Value *getBroadcastInstrs(Value *V) override;
778 };
779 
780 /// Encapsulate information regarding vectorization of a loop and its epilogue.
781 /// This information is meant to be updated and used across two stages of
782 /// epilogue vectorization.
783 struct EpilogueLoopVectorizationInfo {
784   ElementCount MainLoopVF = ElementCount::getFixed(0);
785   unsigned MainLoopUF = 0;
786   ElementCount EpilogueVF = ElementCount::getFixed(0);
787   unsigned EpilogueUF = 0;
788   BasicBlock *MainLoopIterationCountCheck = nullptr;
789   BasicBlock *EpilogueIterationCountCheck = nullptr;
790   BasicBlock *SCEVSafetyCheck = nullptr;
791   BasicBlock *MemSafetyCheck = nullptr;
792   Value *TripCount = nullptr;
793   Value *VectorTripCount = nullptr;
794 
795   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
796                                 ElementCount EVF, unsigned EUF)
797       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
798     assert(EUF == 1 &&
799            "A high UF for the epilogue loop is likely not beneficial.");
800   }
801 };
802 
803 /// An extension of the inner loop vectorizer that creates a skeleton for a
804 /// vectorized loop that has its epilogue (residual) also vectorized.
805 /// The idea is to run the vplan on a given loop twice: first to set up the
806 /// skeleton and vectorize the main loop, and second to complete the skeleton
807 /// from the first step and vectorize the epilogue.  This is achieved by
808 /// deriving two concrete strategy classes from this base class and invoking
809 /// them in succession from the loop vectorizer planner.
810 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
811 public:
812   InnerLoopAndEpilogueVectorizer(
813       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
814       DominatorTree *DT, const TargetLibraryInfo *TLI,
815       const TargetTransformInfo *TTI, AssumptionCache *AC,
816       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
817       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
818       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
819       GeneratedRTChecks &Checks)
820       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
821                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
822                             Checks),
823         EPI(EPI) {}
824 
825   // Override this function to handle the more complex control flow around the
826   // three loops.
827   std::pair<BasicBlock *, Value *>
828   createVectorizedLoopSkeleton() final override {
829     return createEpilogueVectorizedLoopSkeleton();
830   }
831 
832   /// The interface for creating a vectorized skeleton using one of two
833   /// different strategies, each corresponding to one execution of the vplan
834   /// as described above.
835   virtual std::pair<BasicBlock *, Value *>
836   createEpilogueVectorizedLoopSkeleton() = 0;
837 
838   /// Holds and updates state information required to vectorize the main loop
839   /// and its epilogue in two separate passes. This setup helps us avoid
840   /// regenerating and recomputing runtime safety checks. It also helps us to
841   /// shorten the iteration-count-check path length for the cases where the
842   /// iteration count of the loop is so small that the main vector loop is
843   /// completely skipped.
844   EpilogueLoopVectorizationInfo &EPI;
845 };
846 
847 /// A specialized derived class of inner loop vectorizer that performs
848 /// vectorization of *main* loops in the process of vectorizing loops and their
849 /// epilogues.
850 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
851 public:
852   EpilogueVectorizerMainLoop(
853       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
854       DominatorTree *DT, const TargetLibraryInfo *TLI,
855       const TargetTransformInfo *TTI, AssumptionCache *AC,
856       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
857       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
858       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
859       GeneratedRTChecks &Check)
860       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
861                                        EPI, LVL, CM, BFI, PSI, Check) {}
862   /// Implements the interface for creating a vectorized skeleton using the
863   /// *main loop* strategy (i.e. the first pass of vplan execution).
864   std::pair<BasicBlock *, Value *>
865   createEpilogueVectorizedLoopSkeleton() final override;
866 
867 protected:
868   /// Emits an iteration count bypass check once for the main loop (when \p
869   /// ForEpilogue is false) and once for the epilogue loop (when \p
870   /// ForEpilogue is true).
871   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
872   void printDebugTracesAtStart() override;
873   void printDebugTracesAtEnd() override;
874 };
875 
876 /// A specialized derived class of inner loop vectorizer that performs
877 /// vectorization of *epilogue* loops in the process of vectorizing loops and
878 /// their epilogues.
879 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
880 public:
881   EpilogueVectorizerEpilogueLoop(
882       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
883       DominatorTree *DT, const TargetLibraryInfo *TLI,
884       const TargetTransformInfo *TTI, AssumptionCache *AC,
885       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
886       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
887       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
888       GeneratedRTChecks &Checks)
889       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
890                                        EPI, LVL, CM, BFI, PSI, Checks) {
891     TripCount = EPI.TripCount;
892   }
893   /// Implements the interface for creating a vectorized skeleton using the
894   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
895   std::pair<BasicBlock *, Value *>
896   createEpilogueVectorizedLoopSkeleton() final override;
897 
898 protected:
899   /// Emits an iteration count bypass check after the main vector loop has
900   /// finished to see if there are any iterations left to execute by either
901   /// the vector epilogue or the scalar epilogue.
902   BasicBlock *
903   emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
904                                           BasicBlock *Insert);
905   void printDebugTracesAtStart() override;
906   void printDebugTracesAtEnd() override;
907 };
908 } // end namespace llvm
909 
910 /// Look for a meaningful debug location on the instruction or its
911 /// operands.
912 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
913   if (!I)
914     return I;
915 
916   DebugLoc Empty;
917   if (I->getDebugLoc() != Empty)
918     return I;
919 
920   for (Use &Op : I->operands()) {
921     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
922       if (OpInst->getDebugLoc() != Empty)
923         return OpInst;
924   }
925 
926   return I;
927 }
928 
929 void InnerLoopVectorizer::setDebugLocFromInst(
930     const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
931   IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
932   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
933     const DILocation *DIL = Inst->getDebugLoc();
934 
935     // When an FSDiscriminator is enabled, we don't need to add the multiply
936     // factors to the discriminators.
937     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
938         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
939       // FIXME: For scalable vectors, assume vscale=1.
940       auto NewDIL =
941           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
942       if (NewDIL)
943         B->SetCurrentDebugLocation(NewDIL.getValue());
944       else
945         LLVM_DEBUG(dbgs()
946                    << "Failed to create new discriminator: "
947                    << DIL->getFilename() << " Line: " << DIL->getLine());
948     } else
949       B->SetCurrentDebugLocation(DIL);
950   } else
951     B->SetCurrentDebugLocation(DebugLoc());
952 }
953 
954 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
955 /// is passed, the message relates to that particular instruction.
956 #ifndef NDEBUG
957 static void debugVectorizationMessage(const StringRef Prefix,
958                                       const StringRef DebugMsg,
959                                       Instruction *I) {
960   dbgs() << "LV: " << Prefix << DebugMsg;
961   if (I != nullptr)
962     dbgs() << " " << *I;
963   else
964     dbgs() << '.';
965   dbgs() << '\n';
966 }
967 #endif
968 
969 /// Create an analysis remark that explains why vectorization failed
970 ///
971 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
972 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
973 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
974 /// the location of the remark.  \return the remark object that can be
975 /// streamed to.
976 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
977     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
978   Value *CodeRegion = TheLoop->getHeader();
979   DebugLoc DL = TheLoop->getStartLoc();
980 
981   if (I) {
982     CodeRegion = I->getParent();
983     // If there is no debug location attached to the instruction, fall back to
984     // using the loop's.
985     if (I->getDebugLoc())
986       DL = I->getDebugLoc();
987   }
988 
989   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
990 }
991 
992 namespace llvm {
993 
994 /// Return a value for Step multiplied by VF.
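/// For example, with Step = 2 and a fixed VF of 4 this returns the constant 8;
/// with a scalable VF of <vscale x 4> it returns 8 * vscale.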
995 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
996                        int64_t Step) {
997   assert(Ty->isIntegerTy() && "Expected an integer step");
998   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
999   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1000 }
1001 
1002 /// Return the runtime value for VF.
1003 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1004   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1005   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1006 }
1007 
1008 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1009                                   ElementCount VF) {
1010   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1011   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1012   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1013   return B.CreateUIToFP(RuntimeVF, FTy);
1014 }
1015 
1016 void reportVectorizationFailure(const StringRef DebugMsg,
1017                                 const StringRef OREMsg, const StringRef ORETag,
1018                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1019                                 Instruction *I) {
1020   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1021   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1022   ORE->emit(
1023       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1024       << "loop not vectorized: " << OREMsg);
1025 }
1026 
1027 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1028                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1029                              Instruction *I) {
1030   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1031   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1032   ORE->emit(
1033       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1034       << Msg);
1035 }
1036 
1037 } // end namespace llvm
1038 
1039 #ifndef NDEBUG
1040 /// \return string containing a file name and a line # for the given loop.
1041 static std::string getDebugLocString(const Loop *L) {
1042   std::string Result;
1043   if (L) {
1044     raw_string_ostream OS(Result);
1045     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1046       LoopDbgLoc.print(OS);
1047     else
1048       // Just print the module name.
1049       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1050     OS.flush();
1051   }
1052   return Result;
1053 }
1054 #endif
1055 
1056 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1057                                          const Instruction *Orig) {
1058   // If the loop was versioned with memchecks, add the corresponding no-alias
1059   // metadata.
1060   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1061     LVer->annotateInstWithNoAlias(To, Orig);
1062 }
1063 
1064 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1065     VPTransformState &State) {
1066 
1067   // Collect recipes in the backward slice of `Root` that may generate a poison
1068   // value that is used after vectorization.
1069   SmallPtrSet<VPRecipeBase *, 16> Visited;
1070   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1071     SmallVector<VPRecipeBase *, 16> Worklist;
1072     Worklist.push_back(Root);
1073 
1074     // Traverse the backward slice of Root through its use-def chain.
1075     while (!Worklist.empty()) {
1076       VPRecipeBase *CurRec = Worklist.back();
1077       Worklist.pop_back();
1078 
1079       if (!Visited.insert(CurRec).second)
1080         continue;
1081 
1082       // Prune search if we find another recipe generating a widen memory
1083       // instruction. Widen memory instructions involved in address computation
1084       // will lead to gather/scatter instructions, which don't need to be
1085       // handled.
1086       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1087           isa<VPInterleaveRecipe>(CurRec) ||
1088           isa<VPScalarIVStepsRecipe>(CurRec) ||
1089           isa<VPCanonicalIVPHIRecipe>(CurRec))
1090         continue;
1091 
1092       // This recipe contributes to the address computation of a widen
1093       // load/store. Collect recipe if its underlying instruction has
1094       // poison-generating flags.
1095       Instruction *Instr = CurRec->getUnderlyingInstr();
1096       if (Instr && Instr->hasPoisonGeneratingFlags())
1097         State.MayGeneratePoisonRecipes.insert(CurRec);
1098 
1099       // Add new definitions to the worklist.
1100       for (VPValue *operand : CurRec->operands())
1101         if (VPDef *OpDef = operand->getDef())
1102           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1103     }
1104   });
1105 
1106   // Traverse all the recipes in the VPlan and collect the poison-generating
1107   // recipes in the backward slice starting at the address of a
1108   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1109   auto Iter = depth_first(
1110       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1111   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1112     for (VPRecipeBase &Recipe : *VPBB) {
1113       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1114         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1115         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1116         if (AddrDef && WidenRec->isConsecutive() &&
1117             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1118           collectPoisonGeneratingInstrsInBackwardSlice(
1119               cast<VPRecipeBase>(AddrDef));
1120       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1121         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1122         if (AddrDef) {
1123           // Check if any member of the interleave group needs predication.
1124           const InterleaveGroup<Instruction> *InterGroup =
1125               InterleaveRec->getInterleaveGroup();
1126           bool NeedPredication = false;
1127           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1128                I < NumMembers; ++I) {
1129             Instruction *Member = InterGroup->getMember(I);
1130             if (Member)
1131               NeedPredication |=
1132                   Legal->blockNeedsPredication(Member->getParent());
1133           }
1134 
1135           if (NeedPredication)
1136             collectPoisonGeneratingInstrsInBackwardSlice(
1137                 cast<VPRecipeBase>(AddrDef));
1138         }
1139       }
1140     }
1141   }
1142 }
1143 
1144 void InnerLoopVectorizer::addMetadata(Instruction *To,
1145                                       Instruction *From) {
1146   propagateMetadata(To, From);
1147   addNewMetadata(To, From);
1148 }
1149 
1150 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1151                                       Instruction *From) {
1152   for (Value *V : To) {
1153     if (Instruction *I = dyn_cast<Instruction>(V))
1154       addMetadata(I, From);
1155   }
1156 }
1157 
1158 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1159     const RecurrenceDescriptor &RdxDesc) {
1160   auto It = ReductionResumeValues.find(&RdxDesc);
1161   assert(It != ReductionResumeValues.end() &&
1162          "Expected to find a resume value for the reduction.");
1163   return It->second;
1164 }
1165 
1166 namespace llvm {
1167 
1168 // Loop vectorization cost-model hint describing how the scalar epilogue loop
1169 // should be lowered.
1170 enum ScalarEpilogueLowering {
1171 
1172   // The default: allowing scalar epilogues.
1173   CM_ScalarEpilogueAllowed,
1174 
1175   // Vectorization with OptForSize: don't allow epilogues.
1176   CM_ScalarEpilogueNotAllowedOptSize,
1177 
1178   // A special case of vectorization with OptForSize: loops with a very small
1179   // trip count are considered for vectorization under OptForSize, thereby
1180   // making sure the cost of their loop body is dominant, free of runtime
1181   // guards and scalar iteration overheads.
1182   CM_ScalarEpilogueNotAllowedLowTripLoop,
1183 
1184   // Loop hint predicate indicating an epilogue is undesired.
1185   CM_ScalarEpilogueNotNeededUsePredicate,
1186 
1187   // Directive indicating we must either tail fold or not vectorize
1188   CM_ScalarEpilogueNotAllowedUsePredicate
1189 };
1190 
1191 /// ElementCountComparator creates a total ordering for ElementCount
1192 /// for the purposes of using it in a set structure.
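/// For example, all fixed element counts order before all scalable ones, so a
/// fixed VF of 16 compares less than a scalable VF of <vscale x 2>.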
1193 struct ElementCountComparator {
1194   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1195     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1196            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1197   }
1198 };
1199 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1200 
1201 /// LoopVectorizationCostModel - estimates the expected speedups due to
1202 /// vectorization.
1203 /// In many cases vectorization is not profitable. This can happen because of
1204 /// a number of reasons. In this class we mainly attempt to predict the
1205 /// expected speedup/slowdowns due to the supported instruction set. We use the
1206 /// TargetTransformInfo to query the different backends for the cost of
1207 /// different operations.
1208 class LoopVectorizationCostModel {
1209 public:
1210   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1211                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1212                              LoopVectorizationLegality *Legal,
1213                              const TargetTransformInfo &TTI,
1214                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1215                              AssumptionCache *AC,
1216                              OptimizationRemarkEmitter *ORE, const Function *F,
1217                              const LoopVectorizeHints *Hints,
1218                              InterleavedAccessInfo &IAI)
1219       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1220         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1221         Hints(Hints), InterleaveInfo(IAI) {}
1222 
1223   /// \return An upper bound for the vectorization factors (both fixed and
1224   /// scalable). If the factors are 0, vectorization and interleaving should be
1225   /// avoided up front.
1226   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1227 
1228   /// \return True if runtime checks are required for vectorization, and false
1229   /// otherwise.
1230   bool runtimeChecksRequired();
1231 
1232   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
1236   VectorizationFactor
1237   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1238 
1239   VectorizationFactor
1240   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1241                                     const LoopVectorizationPlanner &LVP);
1242 
1243   /// Setup cost-based decisions for user vectorization factor.
1244   /// \return true if the UserVF is a feasible VF to be chosen.
1245   bool selectUserVectorizationFactor(ElementCount UserVF) {
1246     collectUniformsAndScalars(UserVF);
1247     collectInstsToScalarize(UserVF);
1248     return expectedCost(UserVF).first.isValid();
1249   }
1250 
1251   /// \return The size (in bits) of the smallest and widest types in the code
1252   /// that needs to be vectorized. We ignore values that remain scalar such as
1253   /// 64 bit loop indices.
1254   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1255 
1256   /// \return The desired interleave count.
1257   /// If interleave count has been specified by metadata it will be returned.
1258   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1259   /// are the selected vectorization factor and the cost of the selected VF.
1260   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1261 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function makes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1269   void setCostBasedWideningDecision(ElementCount VF);
1270 
1271   /// A struct that represents some properties of the register usage
1272   /// of a loop.
1273   struct RegisterUsage {
1274     /// Holds the number of loop invariant values that are used in the loop.
1275     /// The key is ClassID of target-provided register class.
1276     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1277     /// Holds the maximum number of concurrent live intervals in the loop.
1278     /// The key is ClassID of target-provided register class.
1279     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1280   };
1281 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1284   SmallVector<RegisterUsage, 8>
1285   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1286 
1287   /// Collect values we want to ignore in the cost model.
1288   void collectValuesToIgnore();
1289 
1290   /// Collect all element types in the loop for which widening is needed.
1291   void collectElementTypesForWidening();
1292 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1295   void collectInLoopReductions();
1296 
1297   /// Returns true if we should use strict in-order reductions for the given
1298   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1299   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1300   /// of FP operations.
1301   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1302     return !Hints->allowReordering() && RdxDesc.isOrdered();
1303   }
1304 
1305   /// \returns The smallest bitwidth each instruction can be represented with.
1306   /// The vector equivalents of these instructions should be truncated to this
1307   /// type.
1308   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1309     return MinBWs;
1310   }
1311 
1312   /// \returns True if it is more profitable to scalarize instruction \p I for
1313   /// vectorization factor \p VF.
1314   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1315     assert(VF.isVector() &&
1316            "Profitable to scalarize relevant only for VF > 1.");
1317 
1318     // Cost model is not run in the VPlan-native path - return conservative
1319     // result until this changes.
1320     if (EnableVPlanNativePath)
1321       return false;
1322 
1323     auto Scalars = InstsToScalarize.find(VF);
1324     assert(Scalars != InstsToScalarize.end() &&
1325            "VF not yet analyzed for scalarization profitability");
1326     return Scalars->second.find(I) != Scalars->second.end();
1327   }
1328 
1329   /// Returns true if \p I is known to be uniform after vectorization.
1330   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1331     if (VF.isScalar())
1332       return true;
1333 
1334     // Cost model is not run in the VPlan-native path - return conservative
1335     // result until this changes.
1336     if (EnableVPlanNativePath)
1337       return false;
1338 
1339     auto UniformsPerVF = Uniforms.find(VF);
1340     assert(UniformsPerVF != Uniforms.end() &&
1341            "VF not yet analyzed for uniformity");
1342     return UniformsPerVF->second.count(I);
1343   }
1344 
1345   /// Returns true if \p I is known to be scalar after vectorization.
1346   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1347     if (VF.isScalar())
1348       return true;
1349 
1350     // Cost model is not run in the VPlan-native path - return conservative
1351     // result until this changes.
1352     if (EnableVPlanNativePath)
1353       return false;
1354 
1355     auto ScalarsPerVF = Scalars.find(VF);
1356     assert(ScalarsPerVF != Scalars.end() &&
1357            "Scalar values are not calculated for VF");
1358     return ScalarsPerVF->second.count(I);
1359   }
1360 
1361   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1362   /// for vectorization factor \p VF.
1363   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1364     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1365            !isProfitableToScalarize(I, VF) &&
1366            !isScalarAfterVectorization(I, VF);
1367   }
1368 
1369   /// Decision that was taken during cost calculation for memory instruction.
1370   enum InstWidening {
1371     CM_Unknown,
1372     CM_Widen,         // For consecutive accesses with stride +1.
1373     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1374     CM_Interleave,
1375     CM_GatherScatter,
1376     CM_Scalarize
1377   };
1378 
1379   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380   /// instruction \p I and vector width \p VF.
1381   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1382                            InstructionCost Cost) {
1383     assert(VF.isVector() && "Expected VF >=2");
1384     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1385   }
1386 
1387   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1388   /// interleaving group \p Grp and vector width \p VF.
1389   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1390                            ElementCount VF, InstWidening W,
1391                            InstructionCost Cost) {
1392     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1395     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1396       if (auto *I = Grp->getMember(i)) {
1397         if (Grp->getInsertPos() == I)
1398           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1399         else
1400           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1401       }
1402     }
1403   }
1404 
1405   /// Return the cost model decision for the given instruction \p I and vector
1406   /// width \p VF. Return CM_Unknown if this instruction did not pass
1407   /// through the cost modeling.
1408   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1409     assert(VF.isVector() && "Expected VF to be a vector VF");
1410     // Cost model is not run in the VPlan-native path - return conservative
1411     // result until this changes.
1412     if (EnableVPlanNativePath)
1413       return CM_GatherScatter;
1414 
1415     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1416     auto Itr = WideningDecisions.find(InstOnVF);
1417     if (Itr == WideningDecisions.end())
1418       return CM_Unknown;
1419     return Itr->second.first;
1420   }
1421 
1422   /// Return the vectorization cost for the given instruction \p I and vector
1423   /// width \p VF.
1424   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1425     assert(VF.isVector() && "Expected VF >=2");
1426     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1427     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1428            "The cost is not calculated");
1429     return WideningDecisions[InstOnVF].second;
1430   }
1431 
  /// Return true if instruction \p I is an optimizable truncate whose operand
1433   /// is an induction variable. Such a truncate will be removed by adding a new
1434   /// induction variable with the destination type.
1435   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1436     // If the instruction is not a truncate, return false.
1437     auto *Trunc = dyn_cast<TruncInst>(I);
1438     if (!Trunc)
1439       return false;
1440 
1441     // Get the source and destination types of the truncate.
1442     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1443     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1444 
1445     // If the truncate is free for the given types, return false. Replacing a
1446     // free truncate with an induction variable would add an induction variable
1447     // update instruction to each iteration of the loop. We exclude from this
1448     // check the primary induction variable since it will need an update
1449     // instruction regardless.
1450     Value *Op = Trunc->getOperand(0);
1451     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1452       return false;
1453 
1454     // If the truncated value is not an induction variable, return false.
1455     return Legal->isInductionPhi(Op);
1456   }
1457 
1458   /// Collects the instructions to scalarize for each predicated instruction in
1459   /// the loop.
1460   void collectInstsToScalarize(ElementCount VF);
1461 
  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions that may
  /// be vectorized as interleaved, gather/scatter or scalarized accesses.
1465   void collectUniformsAndScalars(ElementCount VF) {
1466     // Do the analysis once.
1467     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1468       return;
1469     setCostBasedWideningDecision(VF);
1470     collectLoopUniforms(VF);
1471     collectLoopScalars(VF);
1472   }
1473 
  /// Returns true if the target machine supports a masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
1476   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1477     return Legal->isConsecutivePtr(DataType, Ptr) &&
1478            TTI.isLegalMaskedStore(DataType, Alignment);
1479   }
1480 
  /// Returns true if the target machine supports a masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
1483   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1484     return Legal->isConsecutivePtr(DataType, Ptr) &&
1485            TTI.isLegalMaskedLoad(DataType, Alignment);
1486   }
1487 
1488   /// Returns true if the target machine can represent \p V as a masked gather
1489   /// or scatter operation.
1490   bool isLegalGatherOrScatter(Value *V,
1491                               ElementCount VF = ElementCount::getFixed(1)) {
1492     bool LI = isa<LoadInst>(V);
1493     bool SI = isa<StoreInst>(V);
1494     if (!LI && !SI)
1495       return false;
1496     auto *Ty = getLoadStoreType(V);
1497     Align Align = getLoadStoreAlignment(V);
1498     if (VF.isVector())
1499       Ty = VectorType::get(Ty, VF);
1500     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1501            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1502   }
1503 
1504   /// Returns true if the target machine supports all of the reduction
1505   /// variables found for the given VF.
1506   bool canVectorizeReductions(ElementCount VF) const {
1507     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1508       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1509       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1510     }));
1511   }
1512 
1513   /// Returns true if \p I is an instruction that will be scalarized with
1514   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1515   /// instructions include conditional stores and instructions that may divide
1516   /// by zero.
1517   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1518 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  /// Superset of instructions that return true for isScalarWithPredication.
1523   bool isPredicatedInst(Instruction *I, ElementCount VF,
1524                         bool IsKnownUniform = false) {
1525     // When we know the load is uniform and the original scalar loop was not
1526     // predicated we don't need to mark it as a predicated instruction. Any
1527     // vectorised blocks created when tail-folding are something artificial we
1528     // have introduced and we know there is always at least one active lane.
    // That is why we call Legal->blockNeedsPredication here: it does not take
    // tail-folding into account.
1531     if (IsKnownUniform && isa<LoadInst>(I) &&
1532         !Legal->blockNeedsPredication(I->getParent()))
1533       return false;
1534     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1535       return false;
1536     // Loads and stores that need some form of masked operation are predicated
1537     // instructions.
1538     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1539       return Legal->isMaskRequired(I);
1540     return isScalarWithPredication(I, VF);
1541   }
1542 
1543   /// Returns true if \p I is a memory instruction with consecutive memory
1544   /// access that can be widened.
1545   bool
1546   memoryInstructionCanBeWidened(Instruction *I,
1547                                 ElementCount VF = ElementCount::getFixed(1));
1548 
1549   /// Returns true if \p I is a memory instruction in an interleaved-group
1550   /// of memory accesses that can be vectorized with wide vector loads/stores
1551   /// and shuffles.
1552   bool
1553   interleavedAccessCanBeWidened(Instruction *I,
1554                                 ElementCount VF = ElementCount::getFixed(1));
1555 
1556   /// Check if \p Instr belongs to any interleaved access group.
1557   bool isAccessInterleaved(Instruction *Instr) {
1558     return InterleaveInfo.isInterleaved(Instr);
1559   }
1560 
1561   /// Get the interleaved access group that \p Instr belongs to.
1562   const InterleaveGroup<Instruction> *
1563   getInterleavedAccessGroup(Instruction *Instr) {
1564     return InterleaveInfo.getInterleaveGroup(Instr);
1565   }
1566 
1567   /// Returns true if we're required to use a scalar epilogue for at least
1568   /// the final iteration of the original loop.
1569   bool requiresScalarEpilogue(ElementCount VF) const {
1570     if (!isScalarEpilogueAllowed())
1571       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1574     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1575       return true;
1576     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1577   }
1578 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1581   bool isScalarEpilogueAllowed() const {
1582     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1583   }
1584 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
1586   bool foldTailByMasking() const { return FoldTailByMasking; }
1587 
  /// Returns true if the instructions in this block require predication
1589   /// for any reason, e.g. because tail folding now requires a predicate
1590   /// or because the block in the original loop was predicated.
1591   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1592     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1593   }
1594 
1595   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1596   /// nodes to the chain of instructions representing the reductions. Uses a
1597   /// MapVector to ensure deterministic iteration order.
1598   using ReductionChainMap =
1599       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1600 
1601   /// Return the chain of instructions representing an inloop reduction.
1602   const ReductionChainMap &getInLoopReductionChains() const {
1603     return InLoopReductionChains;
1604   }
1605 
1606   /// Returns true if the Phi is part of an inloop reduction.
1607   bool isInLoopReduction(PHINode *Phi) const {
1608     return InLoopReductionChains.count(Phi);
1609   }
1610 
1611   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1612   /// with factor VF.  Return the cost of the instruction, including
1613   /// scalarization overhead if it's needed.
1614   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1615 
1616   /// Estimate cost of a call instruction CI if it were vectorized with factor
1617   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1621   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1622                                     bool &NeedToScalarize) const;
1623 
1624   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1625   /// that of B.
1626   bool isMoreProfitable(const VectorizationFactor &A,
1627                         const VectorizationFactor &B) const;
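  // For illustration (hypothetical numbers): a VF=8 plan costing 20 (2.5 per
  // lane) is considered more profitable than a VF=4 plan costing 12 (3 per
  // lane).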
1628 
1629   /// Invalidates decisions already taken by the cost model.
1630   void invalidateCostModelingDecisions() {
1631     WideningDecisions.clear();
1632     Uniforms.clear();
1633     Scalars.clear();
1634   }
1635 
1636 private:
1637   unsigned NumPredStores = 0;
1638 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TTI method.
1642   Optional<unsigned> getVScaleForTuning() const;
1643 
1644   /// \return An upper bound for the vectorization factors for both
1645   /// fixed and scalable vectorization, where the minimum-known number of
1646   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1647   /// disabled or unsupported, then the scalable part will be equal to
1648   /// ElementCount::getScalable(0).
1649   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1650                                            ElementCount UserVF,
1651                                            bool FoldTailByMasking);
1652 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
1655   /// This is a helper function of computeFeasibleMaxVF.
1656   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1657                                        unsigned SmallestType,
1658                                        unsigned WidestType,
1659                                        ElementCount MaxSafeVF,
1660                                        bool FoldTailByMasking);
1661 
1662   /// \return the maximum legal scalable VF, based on the safe max number
1663   /// of elements.
1664   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1665 
1666   /// The vectorization cost is a combination of the cost itself and a boolean
1667   /// indicating whether any of the contributing operations will actually
1668   /// operate on vector values after type legalization in the backend. If this
1669   /// latter value is false, then all operations will be scalarized (i.e. no
1670   /// vectorization has actually taken place).
1671   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672 
1673   /// Returns the expected execution cost. The unit of the cost does
1674   /// not matter because we use the 'cost' units to compare different
1675   /// vector widths. The cost that is returned is *not* normalized by
1676   /// the factor width. If \p Invalid is not nullptr, this function
1677   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678   /// each instruction that has an Invalid cost for the given VF.
1679   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1680   VectorizationCostTy
1681   expectedCost(ElementCount VF,
1682                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1683 
  /// Returns the execution time cost of an instruction for a given vector
  /// width. A vector width of one means scalar.
1686   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1687 
1688   /// The cost-computation logic from getInstructionCost which provides
1689   /// the vector type as an output parameter.
1690   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1691                                      Type *&VectorTy);
1692 
1693   /// Return the cost of instructions in an inloop reduction pattern, if I is
1694   /// part of that pattern.
1695   Optional<InstructionCost>
1696   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1697                           TTI::TargetCostKind CostKind);
1698 
1699   /// Calculate vectorization cost of memory instruction \p I.
1700   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1701 
  /// The cost computation for a scalarized memory instruction.
1703   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1704 
  /// The cost computation for an interleave group of memory instructions.
1706   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1707 
  /// The cost computation for a gather/scatter instruction.
1709   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1710 
1711   /// The cost computation for widening instruction \p I with consecutive
1712   /// memory access.
1713   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1714 
  /// The cost computation for a Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1719   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1720 
1721   /// Estimate the overhead of scalarizing an instruction. This is a
1722   /// convenience wrapper for the type-based getScalarizationOverhead API.
1723   InstructionCost getScalarizationOverhead(Instruction *I,
1724                                            ElementCount VF) const;
1725 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1728   bool isConsecutiveLoadOrStore(Instruction *I);
1729 
1730   /// Returns true if an artificially high cost for emulated masked memrefs
1731   /// should be used.
1732   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1733 
1734   /// Map of scalar integer values to the smallest bitwidth they can be legally
1735   /// represented as. The vector equivalents of these values should be truncated
1736   /// to this type.
1737   MapVector<Instruction *, uint64_t> MinBWs;
1738 
1739   /// A type representing the costs for instructions if they were to be
1740   /// scalarized rather than vectorized. The entries are Instruction-Cost
1741   /// pairs.
1742   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1743 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1746   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1747 
1748   /// Records whether it is allowed to have the original scalar loop execute at
1749   /// least once. This may be needed as a fallback loop in case runtime
1750   /// aliasing/dependence checks fail, or to handle the tail/remainder
1751   /// iterations when the trip count is unknown or doesn't divide by the VF,
1752   /// or as a peel-loop to handle gaps in interleave-groups.
1753   /// Under optsize and when the trip count is very small we don't allow any
1754   /// iterations to execute in the scalar loop.
1755   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1756 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1758   bool FoldTailByMasking = false;
1759 
1760   /// A map holding scalar costs for different vectorization factors. The
1761   /// presence of a cost for an instruction in the mapping indicates that the
1762   /// instruction will be scalarized when vectorizing with the associated
1763   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1764   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1765 
1766   /// Holds the instructions known to be uniform after vectorization.
1767   /// The data is collected per VF.
1768   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1769 
1770   /// Holds the instructions known to be scalar after vectorization.
1771   /// The data is collected per VF.
1772   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1773 
1774   /// Holds the instructions (address computations) that are forced to be
1775   /// scalarized.
1776   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1777 
1778   /// PHINodes of the reductions that should be expanded in-loop along with
1779   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1781   ReductionChainMap InLoopReductionChains;
1782 
1783   /// A Map of inloop reduction operations and their immediate chain operand.
1784   /// FIXME: This can be removed once reductions can be costed correctly in
1785   /// vplan. This was added to allow quick lookup to the inloop operations,
1786   /// without having to loop through InLoopReductionChains.
1787   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1788 
1789   /// Returns the expected difference in cost from scalarizing the expression
1790   /// feeding a predicated instruction \p PredInst. The instructions to
1791   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1792   /// non-negative return value implies the expression will be scalarized.
1793   /// Currently, only single-use chains are considered for scalarization.
1794   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1795                               ElementCount VF);
1796 
1797   /// Collect the instructions that are uniform after vectorization. An
1798   /// instruction is uniform if we represent it with a single scalar value in
1799   /// the vectorized loop corresponding to each vector iteration. Examples of
1800   /// uniform instructions include pointer operands of consecutive or
1801   /// interleaved memory accesses. Note that although uniformity implies an
1802   /// instruction will be scalar, the reverse is not true. In general, a
1803   /// scalarized instruction will be represented by VF scalar values in the
1804   /// vectorized loop, each corresponding to an iteration of the original
1805   /// scalar loop.
1806   void collectLoopUniforms(ElementCount VF);
1807 
1808   /// Collect the instructions that are scalar after vectorization. An
1809   /// instruction is scalar if it is known to be uniform or will be scalarized
1810   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1811   /// to the list if they are used by a load/store instruction that is marked as
1812   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1813   /// VF values in the vectorized loop, each corresponding to an iteration of
1814   /// the original scalar loop.
1815   void collectLoopScalars(ElementCount VF);
1816 
1817   /// Keeps cost model vectorization decision and cost for instructions.
1818   /// Right now it is used for memory instructions only.
1819   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1820                                 std::pair<InstWidening, InstructionCost>>;
1821 
1822   DecisionList WideningDecisions;
1823 
1824   /// Returns true if \p V is expected to be vectorized and it needs to be
1825   /// extracted.
1826   bool needsExtract(Value *V, ElementCount VF) const {
1827     Instruction *I = dyn_cast<Instruction>(V);
1828     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1829         TheLoop->isLoopInvariant(I))
1830       return false;
1831 
1832     // Assume we can vectorize V (and hence we need extraction) if the
1833     // scalars are not computed yet. This can happen, because it is called
1834     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1835     // the scalars are collected. That should be a safe assumption in most
1836     // cases, because we check if the operands have vectorizable types
1837     // beforehand in LoopVectorizationLegality.
1838     return Scalars.find(VF) == Scalars.end() ||
1839            !isScalarAfterVectorization(I, VF);
1840   };
1841 
1842   /// Returns a range containing only operands needing to be extracted.
1843   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1844                                                    ElementCount VF) const {
1845     return SmallVector<Value *, 4>(make_filter_range(
1846         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1847   }
1848 
1849   /// Determines if we have the infrastructure to vectorize loop \p L and its
1850   /// epilogue, assuming the main loop is vectorized by \p VF.
1851   bool isCandidateForEpilogueVectorization(const Loop &L,
1852                                            const ElementCount VF) const;
1853 
1854   /// Returns true if epilogue vectorization is considered profitable, and
1855   /// false otherwise.
1856   /// \p VF is the vectorization factor chosen for the original loop.
1857   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1858 
1859 public:
1860   /// The loop that we evaluate.
1861   Loop *TheLoop;
1862 
1863   /// Predicated scalar evolution analysis.
1864   PredicatedScalarEvolution &PSE;
1865 
1866   /// Loop Info analysis.
1867   LoopInfo *LI;
1868 
1869   /// Vectorization legality.
1870   LoopVectorizationLegality *Legal;
1871 
1872   /// Vector target information.
1873   const TargetTransformInfo &TTI;
1874 
1875   /// Target Library Info.
1876   const TargetLibraryInfo *TLI;
1877 
1878   /// Demanded bits analysis.
1879   DemandedBits *DB;
1880 
1881   /// Assumption cache.
1882   AssumptionCache *AC;
1883 
1884   /// Interface to emit optimization remarks.
1885   OptimizationRemarkEmitter *ORE;
1886 
1887   const Function *TheFunction;
1888 
1889   /// Loop Vectorize Hint.
1890   const LoopVectorizeHints *Hints;
1891 
  /// The interleaved access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1894   InterleavedAccessInfo &InterleaveInfo;
1895 
1896   /// Values to ignore in the cost model.
1897   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1898 
1899   /// Values to ignore in the cost model when VF > 1.
1900   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1901 
1902   /// All element types found in the loop.
1903   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1904 
1905   /// Profitable vector factors.
1906   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1907 };
1908 } // end namespace llvm
1909 
1910 /// Helper struct to manage generating runtime checks for vectorization.
1911 ///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimation of their cost, and are un-linked from the existing IR. After
/// deciding to vectorize, the checks are moved back. If we decide not to
/// vectorize, the temporary blocks are completely removed.
1916 class GeneratedRTChecks {
1917   /// Basic block which contains the generated SCEV checks, if any.
1918   BasicBlock *SCEVCheckBlock = nullptr;
1919 
1920   /// The value representing the result of the generated SCEV checks. If it is
1921   /// nullptr, either no SCEV checks have been generated or they have been used.
1922   Value *SCEVCheckCond = nullptr;
1923 
1924   /// Basic block which contains the generated memory runtime checks, if any.
1925   BasicBlock *MemCheckBlock = nullptr;
1926 
1927   /// The value representing the result of the generated memory runtime checks.
1928   /// If it is nullptr, either no memory runtime checks have been generated or
1929   /// they have been used.
1930   Value *MemRuntimeCheckCond = nullptr;
1931 
1932   DominatorTree *DT;
1933   LoopInfo *LI;
1934 
1935   SCEVExpander SCEVExp;
1936   SCEVExpander MemCheckExp;
1937 
1938 public:
1939   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1940                     const DataLayout &DL)
1941       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1942         MemCheckExp(SE, DL, "scev.check") {}
1943 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1949   void Create(Loop *L, const LoopAccessInfo &LAI,
1950               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1951 
1952     BasicBlock *LoopHeader = L->getHeader();
1953     BasicBlock *Preheader = L->getLoopPreheader();
1954 
1955     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1956     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1957     // may be used by SCEVExpander. The blocks will be un-linked from their
1958     // predecessors and removed from LI & DT at the end of the function.
1959     if (!UnionPred.isAlwaysTrue()) {
1960       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1961                                   nullptr, "vector.scevcheck");
1962 
1963       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1964           &UnionPred, SCEVCheckBlock->getTerminator());
1965     }
1966 
1967     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1968     if (RtPtrChecking.Need) {
1969       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1970       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1971                                  "vector.memcheck");
1972 
1973       auto DiffChecks = RtPtrChecking.getDiffChecks();
1974       if (DiffChecks) {
1975         MemRuntimeCheckCond = addDiffRuntimeChecks(
1976             MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
1977             [VF](IRBuilderBase &B, unsigned Bits) {
1978               return getRuntimeVF(B, B.getIntNTy(Bits), VF);
1979             },
1980             IC);
1981       } else {
1982         MemRuntimeCheckCond =
1983             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1984                              RtPtrChecking.getChecks(), MemCheckExp);
1985       }
1986       assert(MemRuntimeCheckCond &&
1987              "no RT checks generated although RtPtrChecking "
1988              "claimed checks are required");
1989     }
1990 
1991     if (!MemCheckBlock && !SCEVCheckBlock)
1992       return;
1993 
    // Unhook the temporary blocks containing the checks and update various
    // places accordingly.
1996     if (SCEVCheckBlock)
1997       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1998     if (MemCheckBlock)
1999       MemCheckBlock->replaceAllUsesWith(Preheader);
2000 
2001     if (SCEVCheckBlock) {
2002       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2003       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2004       Preheader->getTerminator()->eraseFromParent();
2005     }
2006     if (MemCheckBlock) {
2007       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2008       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2009       Preheader->getTerminator()->eraseFromParent();
2010     }
2011 
2012     DT->changeImmediateDominator(LoopHeader, Preheader);
2013     if (MemCheckBlock) {
2014       DT->eraseNode(MemCheckBlock);
2015       LI->removeBlock(MemCheckBlock);
2016     }
2017     if (SCEVCheckBlock) {
2018       DT->eraseNode(SCEVCheckBlock);
2019       LI->removeBlock(SCEVCheckBlock);
2020     }
2021   }
2022 
2023   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2024   /// unused.
2025   ~GeneratedRTChecks() {
2026     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2027     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2028     if (!SCEVCheckCond)
2029       SCEVCleaner.markResultUsed();
2030 
2031     if (!MemRuntimeCheckCond)
2032       MemCheckCleaner.markResultUsed();
2033 
2034     if (MemRuntimeCheckCond) {
2035       auto &SE = *MemCheckExp.getSE();
2036       // Memory runtime check generation creates compares that use expanded
2037       // values. Remove them before running the SCEVExpanderCleaners.
2038       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2039         if (MemCheckExp.isInsertedInstruction(&I))
2040           continue;
2041         SE.forgetValue(&I);
2042         I.eraseFromParent();
2043       }
2044     }
2045     MemCheckCleaner.cleanup();
2046     SCEVCleaner.cleanup();
2047 
2048     if (SCEVCheckCond)
2049       SCEVCheckBlock->eraseFromParent();
2050     if (MemRuntimeCheckCond)
2051       MemCheckBlock->eraseFromParent();
2052   }
2053 
2054   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2055   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2056   /// depending on the generated condition.
2057   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2058                              BasicBlock *LoopVectorPreHeader,
2059                              BasicBlock *LoopExitBlock) {
2060     if (!SCEVCheckCond)
2061       return nullptr;
2062 
2063     Value *Cond = SCEVCheckCond;
2064     // Mark the check as used, to prevent it from being removed during cleanup.
2065     SCEVCheckCond = nullptr;
2066     if (auto *C = dyn_cast<ConstantInt>(Cond))
2067       if (C->isZero())
2068         return nullptr;
2069 
2070     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2071 
2072     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader is inside a loop, also add SCEVCheckBlock to
    // that loop.
2074     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2075       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2076 
2077     SCEVCheckBlock->getTerminator()->eraseFromParent();
2078     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2079     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2080                                                 SCEVCheckBlock);
2081 
2082     DT->addNewBlock(SCEVCheckBlock, Pred);
2083     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2084 
2085     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2086                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2087     return SCEVCheckBlock;
2088   }
2089 
2090   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091   /// the branches to branch to the vector preheader or \p Bypass, depending on
2092   /// the generated condition.
2093   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays
    // overlap.
2096     if (!MemRuntimeCheckCond)
2097       return nullptr;
2098 
2099     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2101                                                 MemCheckBlock);
2102 
2103     DT->addNewBlock(MemCheckBlock, Pred);
2104     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2105     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2106 
2107     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2108       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2109 
2110     ReplaceInstWithInst(
2111         MemCheckBlock->getTerminator(),
2112         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2113     MemCheckBlock->getTerminator()->setDebugLoc(
2114         Pred->getTerminator()->getDebugLoc());
2115 
2116     // Mark the check as used, to prevent it from being removed during cleanup.
2117     MemRuntimeCheckCond = nullptr;
2118     return MemCheckBlock;
2119   }
2120 };
2121 
2122 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2129 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2130 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2131 // provides *explicit vectorization hints* (LV can bypass legal checks and
2132 // assume that vectorization is legal). However, both hints are implemented
2133 // using the same metadata (llvm.loop.vectorize, processed by
2134 // LoopVectorizeHints). This will be fixed in the future when the native IR
2135 // representation for pragma 'omp simd' is introduced.
2136 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2137                                    OptimizationRemarkEmitter *ORE) {
2138   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2139   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2140 
2141   // Only outer loops with an explicit vectorization hint are supported.
2142   // Unannotated outer loops are ignored.
2143   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2144     return false;
2145 
2146   Function *Fn = OuterLp->getHeader()->getParent();
2147   if (!Hints.allowVectorization(Fn, OuterLp,
2148                                 true /*VectorizeOnlyWhenForced*/)) {
2149     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2150     return false;
2151   }
2152 
2153   if (Hints.getInterleave() > 1) {
2154     // TODO: Interleave support is future work.
2155     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2156                          "outer loops.\n");
2157     Hints.emitRemarkWithHints();
2158     return false;
2159   }
2160 
2161   return true;
2162 }
2163 
2164 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2165                                   OptimizationRemarkEmitter *ORE,
2166                                   SmallVectorImpl<Loop *> &V) {
2167   // Collect inner loops and outer loops without irreducible control flow. For
2168   // now, only collect outer loops that have explicit vectorization hints. If we
2169   // are stress testing the VPlan H-CFG construction, we collect the outermost
2170   // loop of every loop nest.
2171   if (L.isInnermost() || VPlanBuildStressTest ||
2172       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2173     LoopBlocksRPO RPOT(&L);
2174     RPOT.perform(LI);
2175     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2176       V.push_back(&L);
2177       // TODO: Collect inner loops inside marked outer loops in case
2178       // vectorization fails for the outer loop. Do not invoke
2179       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2180       // already known to be reducible. We can use an inherited attribute for
2181       // that.
2182       return;
2183     }
2184   }
2185   for (Loop *InnerL : L)
2186     collectSupportedLoops(*InnerL, LI, ORE, V);
2187 }
2188 
2189 namespace {
2190 
2191 /// The LoopVectorize Pass.
2192 struct LoopVectorize : public FunctionPass {
2193   /// Pass identification, replacement for typeid
2194   static char ID;
2195 
2196   LoopVectorizePass Impl;
2197 
2198   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2199                          bool VectorizeOnlyWhenForced = false)
2200       : FunctionPass(ID),
2201         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2202     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2203   }
2204 
2205   bool runOnFunction(Function &F) override {
2206     if (skipFunction(F))
2207       return false;
2208 
2209     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2210     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2211     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2212     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2213     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2214     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2215     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2216     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2217     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2218     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2219     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2220     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2221     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2222 
2223     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2224         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2225 
2226     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2227                         GetLAA, *ORE, PSI).MadeAnyChange;
2228   }
2229 
2230   void getAnalysisUsage(AnalysisUsage &AU) const override {
2231     AU.addRequired<AssumptionCacheTracker>();
2232     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2233     AU.addRequired<DominatorTreeWrapperPass>();
2234     AU.addRequired<LoopInfoWrapperPass>();
2235     AU.addRequired<ScalarEvolutionWrapperPass>();
2236     AU.addRequired<TargetTransformInfoWrapperPass>();
2237     AU.addRequired<AAResultsWrapperPass>();
2238     AU.addRequired<LoopAccessLegacyAnalysis>();
2239     AU.addRequired<DemandedBitsWrapperPass>();
2240     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2241     AU.addRequired<InjectTLIMappingsLegacy>();
2242 
2243     // We currently do not preserve loopinfo/dominator analyses with outer loop
2244     // vectorization. Until this is addressed, mark these analyses as preserved
2245     // only for non-VPlan-native path.
2246     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2247     if (!EnableVPlanNativePath) {
2248       AU.addPreserved<LoopInfoWrapperPass>();
2249       AU.addPreserved<DominatorTreeWrapperPass>();
2250     }
2251 
2252     AU.addPreserved<BasicAAWrapperPass>();
2253     AU.addPreserved<GlobalsAAWrapperPass>();
2254     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2255   }
2256 };
2257 
2258 } // end anonymous namespace
2259 
2260 //===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
2263 //===----------------------------------------------------------------------===//
2264 
2265 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2266   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2269   Instruction *Instr = dyn_cast<Instruction>(V);
2270   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2271                      (!Instr ||
2272                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2273   // Place the code for broadcasting invariant variables in the new preheader.
2274   IRBuilder<>::InsertPointGuard Guard(Builder);
2275   if (SafeToHoist)
2276     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2277 
2278   // Broadcast the scalar into all locations in the vector.
2279   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2280 
2281   return Shuf;
2282 }
2283 
2284 /// This function adds
2285 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
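/// For example (illustrative): for an integer induction with StartIdx = 0,
/// Step = 4 and VF = 4, element i of Val is incremented by i * 4, i.e. by
/// <0, 4, 8, 12>.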
2288 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2289                             Instruction::BinaryOps BinOp, ElementCount VF,
2290                             IRBuilderBase &Builder) {
2291   assert(VF.isVector() && "only vector VFs are supported");
2292 
2293   // Create and check the types.
2294   auto *ValVTy = cast<VectorType>(Val->getType());
2295   ElementCount VLen = ValVTy->getElementCount();
2296 
2297   Type *STy = Val->getType()->getScalarType();
2298   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2299          "Induction Step must be an integer or FP");
2300   assert(Step->getType() == STy && "Step has wrong type");
2301 
2302   SmallVector<Constant *, 8> Indices;
2303 
2304   // Create a vector of consecutive numbers from zero to VF.
2305   VectorType *InitVecValVTy = ValVTy;
2306   if (STy->isFloatingPointTy()) {
2307     Type *InitVecValSTy =
2308         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2309     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2310   }
2311   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2312 
2313   // Splat the StartIdx
2314   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2315 
2316   if (STy->isIntegerTy()) {
2317     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2318     Step = Builder.CreateVectorSplat(VLen, Step);
2319     assert(Step->getType() == Val->getType() && "Invalid step vec");
2320     // FIXME: The newly created binary instructions should contain nsw/nuw
2321     // flags, which can be found from the original scalar operations.
2322     Step = Builder.CreateMul(InitVec, Step);
2323     return Builder.CreateAdd(Val, Step, "induction");
2324   }
2325 
2326   // Floating point induction.
2327   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2328          "Binary Opcode should be specified for FP induction");
2329   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2330   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2331 
2332   Step = Builder.CreateVectorSplat(VLen, Step);
2333   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2334   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2335 }
2336 
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, and \p Step is the size of the step.
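/// For example (illustrative): with an integer ScalarIV %iv, Step = 2, VF = 4
/// and UF = 2, the scalars produced are %iv + {0, 2, 4, 6} for part 0 and
/// %iv + {8, 10, 12, 14} for part 1.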
2339 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2340                              const InductionDescriptor &ID, VPValue *Def,
2341                              VPTransformState &State) {
2342   IRBuilderBase &Builder = State.Builder;
2343   // We shouldn't have to build scalar steps if we aren't vectorizing.
2344   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2346   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2347   assert(ScalarIVTy == Step->getType() &&
2348          "Val and Step should have the same type");
2349 
2350   // We build scalar steps for both integer and floating-point induction
2351   // variables. Here, we determine the kind of arithmetic we will perform.
2352   Instruction::BinaryOps AddOp;
2353   Instruction::BinaryOps MulOp;
2354   if (ScalarIVTy->isIntegerTy()) {
2355     AddOp = Instruction::Add;
2356     MulOp = Instruction::Mul;
2357   } else {
2358     AddOp = ID.getInductionOpcode();
2359     MulOp = Instruction::FMul;
2360   }
2361 
2362   // Determine the number of scalars we need to generate for each unroll
2363   // iteration.
2364   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2365   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2366   // Compute the scalar steps and save the results in State.
2367   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2368                                      ScalarIVTy->getScalarSizeInBits());
2369   Type *VecIVTy = nullptr;
2370   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2371   if (!FirstLaneOnly && State.VF.isScalable()) {
2372     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2373     UnitStepVec =
2374         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2375     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2376     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2377   }
2378 
2379   for (unsigned Part = 0; Part < State.UF; ++Part) {
2380     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2381 
2382     if (!FirstLaneOnly && State.VF.isScalable()) {
2383       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2384       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2385       if (ScalarIVTy->isFloatingPointTy())
2386         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2387       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2388       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2389       State.set(Def, Add, Part);
      // It is also useful to record the per-lane values for the known minimum
      // number of elements, so we do that below as well. This improves the
      // code quality when, for example, extracting the first element.
2393     }
2394 
2395     if (ScalarIVTy->isFloatingPointTy())
2396       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2397 
2398     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2399       Value *StartIdx = Builder.CreateBinOp(
2400           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2401       // The step returned by `createStepForVF` is a runtime-evaluated value
2402       // when VF is scalable. Otherwise, it should be folded into a Constant.
2403       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2404              "Expected StartIdx to be folded to a constant when VF is not "
2405              "scalable");
2406       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2407       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2408       State.set(Def, Add, VPIteration(Part, Lane));
2409     }
2410   }
2411 }
2412 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
2415 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2416                               Instruction *InsertBefore,
2417                               Loop *OrigLoop = nullptr) {
2418   const DataLayout &DL = SE.getDataLayout();
2419   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2420          "Induction step should be loop invariant");
2421   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2422     return E->getValue();
2423 
2424   SCEVExpander Exp(SE, DL, "induction");
2425   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2426 }
2427 
2428 /// Compute the transformed value of Index at offset StartValue using step
2429 /// StepValue.
2430 /// For integer induction, returns StartValue + Index * StepValue.
2431 /// For pointer induction, returns StartValue[Index * StepValue].
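/// For floating-point induction, returns StartValue +/- Index * StepValue,
/// using the original FAdd/FSub opcode of the induction.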
2432 /// FIXME: The newly created binary instructions should contain nsw/nuw
2433 /// flags, which can be found from the original scalar operations.
2434 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2435                                    Value *StartValue, Value *Step,
2436                                    const InductionDescriptor &ID) {
2437   assert(Index->getType()->getScalarType() == Step->getType() &&
2438          "Index scalar type does not match StepValue type");
2439 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2446   auto CreateAdd = [&B](Value *X, Value *Y) {
2447     assert(X->getType() == Y->getType() && "Types don't match!");
2448     if (auto *CX = dyn_cast<ConstantInt>(X))
2449       if (CX->isZero())
2450         return Y;
2451     if (auto *CY = dyn_cast<ConstantInt>(Y))
2452       if (CY->isZero())
2453         return X;
2454     return B.CreateAdd(X, Y);
2455   };
2456 
2457   // We allow X to be a vector type, in which case Y will potentially be
2458   // splatted into a vector with the same element count.
2459   auto CreateMul = [&B](Value *X, Value *Y) {
2460     assert(X->getType()->getScalarType() == Y->getType() &&
2461            "Types don't match!");
2462     if (auto *CX = dyn_cast<ConstantInt>(X))
2463       if (CX->isOne())
2464         return Y;
2465     if (auto *CY = dyn_cast<ConstantInt>(Y))
2466       if (CY->isOne())
2467         return X;
2468     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2469     if (XVTy && !isa<VectorType>(Y->getType()))
2470       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2471     return B.CreateMul(X, Y);
2472   };
2473 
2474   switch (ID.getKind()) {
2475   case InductionDescriptor::IK_IntInduction: {
2476     assert(!isa<VectorType>(Index->getType()) &&
2477            "Vector indices not supported for integer inductions yet");
2478     assert(Index->getType() == StartValue->getType() &&
2479            "Index type does not match StartValue type");
2480     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2481       return B.CreateSub(StartValue, Index);
2482     auto *Offset = CreateMul(Index, Step);
2483     return CreateAdd(StartValue, Offset);
2484   }
2485   case InductionDescriptor::IK_PtrInduction: {
2486     assert(isa<Constant>(Step) &&
2487            "Expected constant step for pointer induction");
2488     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2489   }
2490   case InductionDescriptor::IK_FpInduction: {
2491     assert(!isa<VectorType>(Index->getType()) &&
2492            "Vector indices not supported for FP inductions yet");
2493     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2494     auto InductionBinOp = ID.getInductionBinOp();
2495     assert(InductionBinOp &&
2496            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2497             InductionBinOp->getOpcode() == Instruction::FSub) &&
2498            "Original bin op should be defined for FP induction");
2499 
2500     Value *MulExp = B.CreateFMul(Step, Index);
2501     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2502                          "induction");
2503   }
2504   case InductionDescriptor::IK_NoInduction:
2505     return nullptr;
2506   }
2507   llvm_unreachable("invalid enum");
2508 }
2509 
2510 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2511                                                     const VPIteration &Instance,
2512                                                     VPTransformState &State) {
2513   Value *ScalarInst = State.get(Def, Instance);
2514   Value *VectorValue = State.get(Def, Instance.Part);
2515   VectorValue = Builder.CreateInsertElement(
2516       VectorValue, ScalarInst,
2517       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2518   State.set(Def, VectorValue, Instance.Part);
2519 }
2520 
2521 // Return whether we allow using masked interleave-groups (for dealing with
2522 // strided loads/stores that reside in predicated blocks, or for dealing
2523 // with gaps).
2524 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2525   // If an override option has been passed in for interleaved accesses, use it.
2526   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2527     return EnableMaskedInterleavedMemAccesses;
2528 
2529   return TTI.enableMaskedInterleavedAccessVectorization();
2530 }
2531 
2532 // Try to vectorize the interleave group that \p Instr belongs to.
2533 //
// E.g., translate the following interleaved load group (factor = 3):
2535 //   for (i = 0; i < N; i+=3) {
2536 //     R = Pic[i];             // Member of index 0
2537 //     G = Pic[i+1];           // Member of index 1
2538 //     B = Pic[i+2];           // Member of index 2
2539 //     ... // do something to R, G, B
2540 //   }
2541 // To:
2542 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2543 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2544 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2545 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2546 //
// Or translate the following interleaved store group (factor = 3):
2548 //   for (i = 0; i < N; i+=3) {
2549 //     ... do something to R, G, B
2550 //     Pic[i]   = R;           // Member of index 0
2551 //     Pic[i+1] = G;           // Member of index 1
2552 //     Pic[i+2] = B;           // Member of index 2
2553 //   }
2554 // To:
2555 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2556 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2557 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2558 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2559 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2560 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2561     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2562     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2563     VPValue *BlockInMask) {
2564   Instruction *Instr = Group->getInsertPos();
2565   const DataLayout &DL = Instr->getModule()->getDataLayout();
2566 
2567   // Prepare for the vector type of the interleaved load/store.
2568   Type *ScalarTy = getLoadStoreType(Instr);
2569   unsigned InterleaveFactor = Group->getFactor();
2570   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2571   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2572 
2573   // Prepare for the new pointers.
2574   SmallVector<Value *, 2> AddrParts;
2575   unsigned Index = Group->getIndex(Instr);
2576 
2577   // TODO: extend the masked interleaved-group support to reversed access.
2578   assert((!BlockInMask || !Group->isReverse()) &&
2579          "Reversed masked interleave-group not supported.");
2580 
2581   // If the group is reverse, adjust the index to refer to the last vector lane
2582   // instead of the first. We adjust the index from the first vector lane,
2583   // rather than directly getting the pointer for lane VF - 1, because the
2584   // pointer operand of the interleaved access is supposed to be uniform. For
2585   // uniform instructions, we're only required to generate a value for the
2586   // first vector lane in each unroll iteration.
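  // E.g., with VF == 4 and an interleave factor of 3, the index is advanced by
  // (4 - 1) * 3 == 9 elements, so the adjusted pointer below refers to the
  // group of the last vector lane rather than the first.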
2587   if (Group->isReverse())
2588     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2589 
2590   for (unsigned Part = 0; Part < UF; Part++) {
2591     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2592     setDebugLocFromInst(AddrPart);
2593 
    // Note that the current instruction could be at any member index, so we
    // need to adjust the address to that of the member at index 0.
2596     //
2597     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2598     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2600     //
2601     // E.g.  A[i+1] = a;     // Member of index 1
2602     //       A[i]   = b;     // Member of index 0
2603     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
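    // E.g., in the second example above Index is 2, so the GEP below steps
    // back two elements to reach A[i].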
2605 
2606     bool InBounds = false;
2607     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2608       InBounds = gep->isInBounds();
2609     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2610     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2611 
2612     // Cast to the vector pointer type.
2613     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2614     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2615     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2616   }
2617 
2618   setDebugLocFromInst(Instr);
2619   Value *PoisonVec = PoisonValue::get(VecTy);
2620 
2621   Value *MaskForGaps = nullptr;
2622   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2623     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2624     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2625   }
2626 
2627   // Vectorize the interleaved load group.
2628   if (isa<LoadInst>(Instr)) {
2629     // For each unroll part, create a wide load for the group.
2630     SmallVector<Value *, 2> NewLoads;
2631     for (unsigned Part = 0; Part < UF; Part++) {
2632       Instruction *NewLoad;
2633       if (BlockInMask || MaskForGaps) {
2634         assert(useMaskedInterleavedAccesses(*TTI) &&
2635                "masked interleaved groups are not allowed.");
2636         Value *GroupMask = MaskForGaps;
2637         if (BlockInMask) {
2638           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2639           Value *ShuffledMask = Builder.CreateShuffleVector(
2640               BlockInMaskPart,
2641               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2642               "interleaved.mask");
2643           GroupMask = MaskForGaps
2644                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2645                                                 MaskForGaps)
2646                           : ShuffledMask;
2647         }
2648         NewLoad =
2649             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2650                                      GroupMask, PoisonVec, "wide.masked.vec");
2651       }
2652       else
2653         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2654                                             Group->getAlign(), "wide.vec");
2655       Group->addMetadata(NewLoad);
2656       NewLoads.push_back(NewLoad);
2657     }
2658 
2659     // For each member in the group, shuffle out the appropriate data from the
2660     // wide loads.
2661     unsigned J = 0;
2662     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2663       Instruction *Member = Group->getMember(I);
2664 
2665       // Skip the gaps in the group.
2666       if (!Member)
2667         continue;
2668 
2669       auto StrideMask =
2670           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2671       for (unsigned Part = 0; Part < UF; Part++) {
2672         Value *StridedVec = Builder.CreateShuffleVector(
2673             NewLoads[Part], StrideMask, "strided.vec");
2674 
        // If this member has a different type, cast the result to that type.
2676         if (Member->getType() != ScalarTy) {
2677           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2678           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2679           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2680         }
2681 
2682         if (Group->isReverse())
2683           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2684 
2685         State.set(VPDefs[J], StridedVec, Part);
2686       }
2687       ++J;
2688     }
2689     return;
2690   }
2691 
  // The sub-vector type for the current instruction.
2693   auto *SubVT = VectorType::get(ScalarTy, VF);
2694 
2695   // Vectorize the interleaved store group.
2696   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2697   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2698          "masked interleaved groups are not allowed.");
2699   assert((!MaskForGaps || !VF.isScalable()) &&
2700          "masking gaps for scalable vectors is not yet supported.");
2701   for (unsigned Part = 0; Part < UF; Part++) {
2702     // Collect the stored vector from each member.
2703     SmallVector<Value *, 4> StoredVecs;
2704     for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Failed to get a member from an interleaved store group");
2707       Instruction *Member = Group->getMember(i);
2708 
2709       // Skip the gaps in the group.
2710       if (!Member) {
2711         Value *Undef = PoisonValue::get(SubVT);
2712         StoredVecs.push_back(Undef);
2713         continue;
2714       }
2715 
2716       Value *StoredVec = State.get(StoredValues[i], Part);
2717 
2718       if (Group->isReverse())
2719         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2720 
      // If this member has a different type, cast it to a unified type.
2723       if (StoredVec->getType() != SubVT)
2724         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2725 
2726       StoredVecs.push_back(StoredVec);
2727     }
2728 
2729     // Concatenate all vectors into a wide vector.
2730     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2731 
2732     // Interleave the elements in the wide vector.
2733     Value *IVec = Builder.CreateShuffleVector(
2734         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2735         "interleaved.vec");
2736 
2737     Instruction *NewStoreInstr;
2738     if (BlockInMask || MaskForGaps) {
2739       Value *GroupMask = MaskForGaps;
2740       if (BlockInMask) {
2741         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2742         Value *ShuffledMask = Builder.CreateShuffleVector(
2743             BlockInMaskPart,
2744             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2745             "interleaved.mask");
2746         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2747                                                       ShuffledMask, MaskForGaps)
2748                                 : ShuffledMask;
2749       }
2750       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2751                                                 Group->getAlign(), GroupMask);
2752     } else
2753       NewStoreInstr =
2754           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2755 
2756     Group->addMetadata(NewStoreInstr);
2757   }
2758 }
2759 
2760 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2761                                                VPReplicateRecipe *RepRecipe,
2762                                                const VPIteration &Instance,
2763                                                bool IfPredicateInstr,
2764                                                VPTransformState &State) {
2765   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2766 
2767   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2768   // the first lane and part.
2769   if (isa<NoAliasScopeDeclInst>(Instr))
2770     if (!Instance.isFirstIteration())
2771       return;
2772 
  // Does this instruction return a value?
2774   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2775 
2776   Instruction *Cloned = Instr->clone();
2777   if (!IsVoidRetTy)
2778     Cloned->setName(Instr->getName() + ".cloned");
2779 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2786   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2787     Cloned->dropPoisonGeneratingFlags();
2788 
2789   if (Instr->getDebugLoc())
2790     setDebugLocFromInst(Instr);
2791 
2792   // Replace the operands of the cloned instructions with their scalar
2793   // equivalents in the new loop.
2794   for (auto &I : enumerate(RepRecipe->operands())) {
2795     auto InputInstance = Instance;
2796     VPValue *Operand = I.value();
2797     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2798     if (OperandR && OperandR->isUniform())
2799       InputInstance.Lane = VPLane::getFirstLane();
2800     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2801   }
2802   addNewMetadata(Cloned, Instr);
2803 
2804   // Place the cloned scalar in the new loop.
2805   State.Builder.Insert(Cloned);
2806 
2807   State.set(RepRecipe, Cloned, Instance);
2808 
  // If we just cloned a new assumption, add it to the assumption cache.
2810   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2811     AC->registerAssumption(II);
2812 
2813   // End if-block.
2814   if (IfPredicateInstr)
2815     PredicatedInstructions.push_back(Cloned);
2816 }
2817 
2818 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2819   if (TripCount)
2820     return TripCount;
2821 
2822   assert(InsertBlock);
2823   IRBuilder<> Builder(InsertBlock->getTerminator());
2824   // Find the loop boundaries.
2825   ScalarEvolution *SE = PSE.getSE();
2826   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2827   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2828          "Invalid loop count");
2829 
2830   Type *IdxTy = Legal->getWidestInductionType();
2831   assert(IdxTy && "No type for induction");
2832 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable was signed and therefore will not overflow, making
  // the truncation legal.
2838   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2839       IdxTy->getPrimitiveSizeInBits())
2840     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2841   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2842 
2843   // Get the total trip count from the count by adding 1.
2844   const SCEV *ExitCount = SE->getAddExpr(
2845       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2846 
2847   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2848 
2849   // Expand the trip count and place the new instructions in the preheader.
2850   // Notice that the pre-header does not change, only the loop body.
2851   SCEVExpander Exp(*SE, DL, "induction");
2852 
2853   // Count holds the overall loop count (N).
2854   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2855                                 InsertBlock->getTerminator());
2856 
2857   if (TripCount->getType()->isPointerTy())
2858     TripCount =
2859         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2860                                     InsertBlock->getTerminator());
2861 
2862   return TripCount;
2863 }
2864 
2865 Value *
2866 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2867   if (VectorTripCount)
2868     return VectorTripCount;
2869 
2870   Value *TC = getOrCreateTripCount(InsertBlock);
2871   IRBuilder<> Builder(InsertBlock->getTerminator());
2872 
2873   Type *Ty = TC->getType();
2874   // This is where we can make the step a runtime constant.
2875   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2876 
2877   // If the tail is to be folded by masking, round the number of iterations N
2878   // up to a multiple of Step instead of rounding down. This is done by first
2879   // adding Step-1 and then rounding down. Note that it's ok if this addition
2880   // overflows: the vector induction variable will eventually wrap to zero given
2881   // that it starts at zero and its Step is a power of two; the loop will then
2882   // exit, with the last early-exit vector comparison also producing all-true.
2883   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2884   // is accounted for in emitIterationCountCheck that adds an overflow check.
2885   if (Cost->foldTailByMasking()) {
2886     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2887            "VF*UF must be a power of 2 when folding tail by masking");
2888     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2889     TC = Builder.CreateAdd(
2890         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2891   }
2892 
2893   // Now we need to generate the expression for the part of the loop that the
2894   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2895   // iterations are not required for correctness, or N - Step, otherwise. Step
2896   // is equal to the vectorization factor (number of SIMD elements) times the
2897   // unroll factor (number of SIMD instructions).
2898   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2899 
2900   // There are cases where we *must* run at least one iteration in the remainder
2901   // loop.  See the cost model for when this can happen.  If the step evenly
2902   // divides the trip count, we set the remainder to be equal to the step. If
2903   // the step does not evenly divide the trip count, no adjustment is necessary
2904   // since there will already be scalar iterations. Note that the minimum
2905   // iterations check ensures that N >= Step.
2906   if (Cost->requiresScalarEpilogue(VF)) {
2907     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2908     R = Builder.CreateSelect(IsZero, Step, R);
2909   }
2910 
2911   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2912 
2913   return VectorTripCount;
2914 }
2915 
2916 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2917                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
2919   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2920   unsigned VF = DstFVTy->getNumElements();
2921   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2923   Type *SrcElemTy = SrcVecTy->getElementType();
2924   Type *DstElemTy = DstFVTy->getElementType();
2925   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2926          "Vector elements must have same size");
2927 
2928   // Do a direct cast if element types are castable.
2929   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2930     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2931   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
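  // E.g., on a target with 64-bit pointers, casting <4 x double> to a vector
  // of pointers goes <4 x double> -> <4 x i64> -> pointer vector.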
2936   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2937          "Only one type should be a pointer type");
2938   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2939          "Only one type should be a floating point type");
2940   Type *IntTy =
2941       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2942   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2943   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2944   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2945 }
2946 
2947 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2948   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2949   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
2951   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2952   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2953 
2954   // Generate code to check if the loop's trip count is less than VF * UF, or
2955   // equal to it in case a scalar epilogue is required; this implies that the
2956   // vector trip count is zero. This check also covers the case where adding one
2957   // to the backedge-taken count overflowed leading to an incorrect trip count
2958   // of zero. In this case we will also jump to the scalar loop.
2959   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2960                                             : ICmpInst::ICMP_ULT;
2961 
2962   // If tail is to be folded, vector loop takes care of all iterations.
2963   Type *CountTy = Count->getType();
2964   Value *CheckMinIters = Builder.getFalse();
2965   Value *Step = createStepForVF(Builder, CountTy, VF, UF);
2966   if (!Cost->foldTailByMasking())
2967     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2968   else if (VF.isScalable()) {
2969     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2970     // an overflow to zero when updating induction variables and so an
2971     // additional overflow check is required before entering the vector loop.
2972 
2973     // Get the maximum unsigned value for the type.
2974     Value *MaxUIntTripCount =
2975         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2976     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2977 
2978     // Don't execute the vector loop if (UMax - n) < (VF * UF).
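    // E.g., for an i32 trip count of UINT32_MAX - 2 and a step of 8 * vscale,
    // UMax - n == 2 is less than the step, so the comparison below is true and
    // we branch to the scalar loop instead of letting the induction wrap.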
2979     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
2980   }
2981   // Create new preheader for vector loop.
2982   LoopVectorPreHeader =
2983       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2984                  "vector.ph");
2985 
2986   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2987                                DT->getNode(Bypass)->getIDom()) &&
2988          "TC check is expected to dominate Bypass");
2989 
2990   // Update dominator for Bypass & LoopExit (if needed).
2991   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2992   if (!Cost->requiresScalarEpilogue(VF))
2993     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
2995     // dominator of the exit blocks.
2996     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2997 
2998   ReplaceInstWithInst(
2999       TCCheckBlock->getTerminator(),
3000       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3001   LoopBypassBlocks.push_back(TCCheckBlock);
3002 }
3003 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3006   BasicBlock *const SCEVCheckBlock =
3007       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3008   if (!SCEVCheckBlock)
3009     return nullptr;
3010 
  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Update dominator only if this is the first RT check.
3018   if (LoopBypassBlocks.empty()) {
3019     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3020     if (!Cost->requiresScalarEpilogue(VF))
3021       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3023       // dominator of the exit blocks.
3024       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3025   }
3026 
3027   LoopBypassBlocks.push_back(SCEVCheckBlock);
3028   AddedSafetyChecks = true;
3029   return SCEVCheckBlock;
3030 }
3031 
3032 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3033   // VPlan-native path does not do any analysis for runtime checks currently.
3034   if (EnableVPlanNativePath)
3035     return nullptr;
3036 
3037   BasicBlock *const MemCheckBlock =
3038       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3039 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3043   if (!MemCheckBlock)
3044     return nullptr;
3045 
3046   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3047     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3048            "Cannot emit memory checks when optimizing for size, unless forced "
3049            "to vectorize.");
3050     ORE->emit([&]() {
3051       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3052                                         OrigLoop->getStartLoc(),
3053                                         OrigLoop->getHeader())
3054              << "Code-size may be reduced by not forcing "
3055                 "vectorization, or by source-code modifications "
3056                 "eliminating the need for runtime checks "
3057                 "(e.g., adding 'restrict').";
3058     });
3059   }
3060 
3061   LoopBypassBlocks.push_back(MemCheckBlock);
3062 
3063   AddedSafetyChecks = true;
3064 
3065   // Only use noalias metadata when using memory checks guaranteeing no overlap
3066   // across all iterations.
3067   if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
3068     //  We currently don't use LoopVersioning for the actual loop cloning but we
3069     //  still use it to add the noalias metadata.
3070     LVer = std::make_unique<LoopVersioning>(
3071         *Legal->getLAI(),
3072         Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3073         DT, PSE.getSE());
3074     LVer->prepareNoAliasMetadata();
3075   }
3076   return MemCheckBlock;
3077 }
3078 
3079 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3080   LoopScalarBody = OrigLoop->getHeader();
3081   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3082   assert(LoopVectorPreHeader && "Invalid loop structure");
3083   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3084   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3085          "multiple exit loop without required epilogue?");
3086 
3087   LoopMiddleBlock =
3088       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3089                  LI, nullptr, Twine(Prefix) + "middle.block");
3090   LoopScalarPreHeader =
3091       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3092                  nullptr, Twine(Prefix) + "scalar.ph");
3093 
3094   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3095 
3096   // Set up the middle block terminator.  Two cases:
3097   // 1) If we know that we must execute the scalar epilogue, emit an
3098   //    unconditional branch.
3099   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3101   //    branch from the middle block to the loop scalar preheader, and the
3102   //    exit block.  completeLoopSkeleton will update the condition to use an
3103   //    iteration check, if required to decide whether to execute the remainder.
3104   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3105     BranchInst::Create(LoopScalarPreHeader) :
3106     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3107                        Builder.getTrue());
3108   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3109   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3110 
3111   // Update dominator for loop exit. During skeleton creation, only the vector
3112   // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
3114   if (!Cost->requiresScalarEpilogue(VF))
3115     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3117     // dominator of the exit blocks.
3118     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3119 }
3120 
3121 void InnerLoopVectorizer::createInductionResumeValues(
3122     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3123   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3124           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3125          "Inconsistent information about additional bypass.");
3126 
3127   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3128   assert(VectorTripCount && "Expected valid arguments");
3129   // We are going to resume the execution of the scalar loop.
3130   // Go over all of the induction variables that we found and fix the
3131   // PHIs that are left in the scalar version of the loop.
3132   // The starting values of PHI nodes depend on the counter of the last
3133   // iteration in the vectorized loop.
3134   // If we come from a bypass edge then we need to start from the original
3135   // start value.
3136   Instruction *OldInduction = Legal->getPrimaryInduction();
3137   for (auto &InductionEntry : Legal->getInductionVars()) {
3138     PHINode *OrigPhi = InductionEntry.first;
3139     InductionDescriptor II = InductionEntry.second;
3140 
    // Create phi nodes to merge from the backedge-taken check block.
3142     PHINode *BCResumeVal =
3143         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3144                         LoopScalarPreHeader->getTerminator());
3145     // Copy original phi DL over to the new one.
3146     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3147     Value *&EndValue = IVEndValues[OrigPhi];
3148     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3149     if (OrigPhi == OldInduction) {
3150       // We know what the end value is.
3151       EndValue = VectorTripCount;
3152     } else {
3153       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3154 
3155       // Fast-math-flags propagate from the original induction instruction.
3156       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3157         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3158 
3159       Type *StepType = II.getStep()->getType();
3160       Instruction::CastOps CastOp =
3161           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3162       Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3163       Value *Step =
3164           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3165       EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3166       EndValue->setName("ind.end");
3167 
3168       // Compute the end value for the additional bypass (if applicable).
3169       if (AdditionalBypass.first) {
3170         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3171         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3172                                          StepType, true);
3173         Value *Step =
3174             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3175         VTC =
3176             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3177         EndValueFromAdditionalBypass =
3178             emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3179         EndValueFromAdditionalBypass->setName("ind.end");
3180       }
3181     }
3182     // The new PHI merges the original incoming value, in case of a bypass,
3183     // or the value at the end of the vectorized loop.
3184     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3185 
3186     // Fix the scalar body counter (PHI node).
3187     // The old induction's phi node in the scalar body needs the truncated
3188     // value.
3189     for (BasicBlock *BB : LoopBypassBlocks)
3190       BCResumeVal->addIncoming(II.getStartValue(), BB);
3191 
3192     if (AdditionalBypass.first)
3193       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3194                                             EndValueFromAdditionalBypass);
3195 
3196     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3197   }
3198 }
3199 
3200 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3201   // The trip counts should be cached by now.
3202   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3203   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3204 
3205   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3206 
3207   // Add a check in the middle block to see if we have completed
3208   // all of the iterations in the first vector loop.  Three cases:
3209   // 1) If we require a scalar epilogue, there is no conditional branch as
3210   //    we unconditionally branch to the scalar preheader.  Do nothing.
3211   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3212   //    Thus if tail is to be folded, we know we don't need to run the
3213   //    remainder and we can use the previous value for the condition (true).
3214   // 3) Otherwise, construct a runtime check.
3215   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3216     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3217                                         Count, VectorTripCount, "cmp.n",
3218                                         LoopMiddleBlock->getTerminator());
3219 
3220     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3221     // of the corresponding compare because they may have ended up with
3222     // different line numbers and we want to avoid awkward line stepping while
3223     // debugging. Eg. if the compare has got a line number inside the loop.
3224     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3225     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3226   }
3227 
3228 #ifdef EXPENSIVE_CHECKS
3229   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3230 #endif
3231 
3232   return LoopVectorPreHeader;
3233 }
3234 
3235 std::pair<BasicBlock *, Value *>
3236 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3237   /*
3238    In this function we generate a new loop. The new loop will contain
3239    the vectorized instructions while the old loop will continue to run the
3240    scalar remainder.
3241 
3242        [ ] <-- loop iteration number check.
3243     /   |
3244    /    v
3245   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3246   |  /  |
3247   | /   v
3248   ||   [ ]     <-- vector pre header.
3249   |/    |
3250   |     v
3251   |    [  ] \
3252   |    [  ]_|   <-- vector loop (created during VPlan execution).
3253   |     |
3254   |     v
3255   \   -[ ]   <--- middle-block.
3256    \/   |
3257    /\   v
3258    | ->[ ]     <--- new preheader.
3259    |    |
3260  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3261    |   [ ] \
3262    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3263     \   |
3264      \  v
3265       >[ ]     <-- exit block(s).
3266    ...
3267    */
3268 
3269   // Get the metadata of the original loop before it gets modified.
3270   MDNode *OrigLoopID = OrigLoop->getLoopID();
3271 
3272   // Workaround!  Compute the trip count of the original loop and cache it
3273   // before we start modifying the CFG.  This code has a systemic problem
3274   // wherein it tries to run analysis over partially constructed IR; this is
3275   // wrong, and not simply for SCEV.  The trip count of the original loop
3276   // simply happens to be prone to hitting this in practice.  In theory, we
3277   // can hit the same issue for any SCEV, or ValueTracking query done during
3278   // mutation.  See PR49900.
3279   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3280 
3281   // Create an empty vector loop, and prepare basic blocks for the runtime
3282   // checks.
3283   createVectorLoopSkeleton("");
3284 
3285   // Now, compare the new count to zero. If it is zero skip the vector loop and
3286   // jump to the scalar loop. This check also covers the case where the
3287   // backedge-taken count is uint##_max: adding one to it will overflow leading
3288   // to an incorrect trip count of zero. In this (rare) case we will also jump
3289   // to the scalar loop.
3290   emitIterationCountCheck(LoopScalarPreHeader);
3291 
3292   // Generate the code to check any assumptions that we've made for SCEV
3293   // expressions.
3294   emitSCEVChecks(LoopScalarPreHeader);
3295 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3299   emitMemRuntimeChecks(LoopScalarPreHeader);
3300 
3301   // Emit phis for the new starting index of the scalar loop.
3302   createInductionResumeValues();
3303 
3304   return {completeLoopSkeleton(OrigLoopID), nullptr};
3305 }
3306 
3307 // Fix up external users of the induction variable. At this point, we are
3308 // in LCSSA form, with all external PHIs that use the IV having one input value,
3309 // coming from the remainder loop. We need those PHIs to also have a correct
3310 // value for the IV when arriving directly from the middle block.
3311 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3312                                        const InductionDescriptor &II,
3313                                        Value *VectorTripCount, Value *EndValue,
3314                                        BasicBlock *MiddleBlock,
3315                                        BasicBlock *VectorHeader, VPlan &Plan) {
3316   // There are two kinds of external IV usages - those that use the value
3317   // computed in the last iteration (the PHI) and those that use the penultimate
3318   // value (the value that feeds into the phi from the loop latch).
3319   // We allow both, but they, obviously, have different values.
3320 
3321   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3322 
3323   DenseMap<Value *, Value *> MissingVals;
3324 
3325   // An external user of the last iteration's value should see the value that
3326   // the remainder loop uses to initialize its own IV.
3327   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3328   for (User *U : PostInc->users()) {
3329     Instruction *UI = cast<Instruction>(U);
3330     if (!OrigLoop->contains(UI)) {
3331       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3332       MissingVals[UI] = EndValue;
3333     }
3334   }
3335 
  // An external user of the penultimate value needs to see EndValue - Step.
3337   // The simplest way to get this is to recompute it from the constituent SCEVs,
3338   // that is Start + (Step * (CRD - 1)).
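  // E.g., for an IV starting at 0 with step 1 and a vector trip count of 16,
  // the escaping penultimate value is 0 + 1 * (16 - 1) == 15.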
3339   for (User *U : OrigPhi->users()) {
3340     auto *UI = cast<Instruction>(U);
3341     if (!OrigLoop->contains(UI)) {
3342       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3343 
3344       IRBuilder<> B(MiddleBlock->getTerminator());
3345 
3346       // Fast-math-flags propagate from the original induction instruction.
3347       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3348         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3349 
3350       Value *CountMinusOne = B.CreateSub(
3351           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3352       Value *CMO =
3353           !II.getStep()->getType()->isIntegerTy()
3354               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3355                              II.getStep()->getType())
3356               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3357       CMO->setName("cast.cmo");
3358 
3359       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3360                                     VectorHeader->getTerminator());
3361       Value *Escape =
3362           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3363       Escape->setName("ind.escape");
3364       MissingVals[UI] = Escape;
3365     }
3366   }
3367 
3368   for (auto &I : MissingVals) {
3369     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3371     // that is %IV2 = phi [...], [ %IV1, %latch ]
3372     // In this case, if IV1 has an external use, we need to avoid adding both
3373     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3374     // don't already have an incoming value for the middle block.
3375     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3376       PHI->addIncoming(I.second, MiddleBlock);
3377       Plan.removeLiveOut(PHI);
3378     }
3379   }
3380 }
3381 
3382 namespace {
3383 
3384 struct CSEDenseMapInfo {
3385   static bool canHandle(const Instruction *I) {
3386     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3387            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3388   }
3389 
3390   static inline Instruction *getEmptyKey() {
3391     return DenseMapInfo<Instruction *>::getEmptyKey();
3392   }
3393 
3394   static inline Instruction *getTombstoneKey() {
3395     return DenseMapInfo<Instruction *>::getTombstoneKey();
3396   }
3397 
3398   static unsigned getHashValue(const Instruction *I) {
3399     assert(canHandle(I) && "Unknown instruction!");
3400     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3401                                                            I->value_op_end()));
3402   }
3403 
3404   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3405     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3406         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3407       return LHS == RHS;
3408     return LHS->isIdenticalTo(RHS);
3409   }
3410 };
3411 
3412 } // end anonymous namespace
3413 
/// Perform CSE of induction variable instructions.
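/// E.g., identical 'getelementptr' or 'extractelement' instructions created
/// for different unroll parts are replaced with a single instruction.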
3415 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3417   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3418   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3419     if (!CSEDenseMapInfo::canHandle(&In))
3420       continue;
3421 
3422     // Check if we can replace this instruction with any of the
3423     // visited instructions.
3424     if (Instruction *V = CSEMap.lookup(&In)) {
3425       In.replaceAllUsesWith(V);
3426       In.eraseFromParent();
3427       continue;
3428     }
3429 
3430     CSEMap[&In] = &In;
3431   }
3432 }
3433 
3434 InstructionCost
3435 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3436                                               bool &NeedToScalarize) const {
3437   Function *F = CI->getCalledFunction();
3438   Type *ScalarRetTy = CI->getType();
3439   SmallVector<Type *, 4> Tys, ScalarTys;
3440   for (auto &ArgOp : CI->args())
3441     ScalarTys.push_back(ArgOp->getType());
3442 
3443   // Estimate cost of scalarized vector call. The source operands are assumed
3444   // to be vectors, so we need to extract individual elements from there,
3445   // execute VF scalar calls, and then gather the result into the vector return
3446   // value.
3447   InstructionCost ScalarCallCost =
3448       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3449   if (VF.isScalar())
3450     return ScalarCallCost;
3451 
3452   // Compute corresponding vector type for return value and arguments.
3453   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3454   for (Type *ScalarTy : ScalarTys)
3455     Tys.push_back(ToVectorTy(ScalarTy, VF));
3456 
3457   // Compute costs of unpacking argument values for the scalar calls and
3458   // packing the return values to a vector.
3459   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3460 
3461   InstructionCost Cost =
3462       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
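  // E.g., with VF == 4, a scalar call cost of 10 and a scalarization overhead
  // of 6, the scalarized cost is 4 * 10 + 6 == 46; a cheaper vector variant
  // found below takes precedence.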
3463 
3464   // If we can't emit a vector call for this function, then the currently found
3465   // cost is the cost we need to return.
3466   NeedToScalarize = true;
3467   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3468   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3469 
3470   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3471     return Cost;
3472 
3473   // If the corresponding vector cost is cheaper, return its cost.
3474   InstructionCost VectorCallCost =
3475       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3476   if (VectorCallCost < Cost) {
3477     NeedToScalarize = false;
3478     Cost = VectorCallCost;
3479   }
3480   return Cost;
3481 }
3482 
3483 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3484   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3485     return Elt;
3486   return VectorType::get(Elt, VF);
3487 }
3488 
3489 InstructionCost
3490 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3491                                                    ElementCount VF) const {
3492   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3493   assert(ID && "Expected intrinsic call!");
3494   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3495   FastMathFlags FMF;
3496   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3497     FMF = FPMO->getFastMathFlags();
3498 
3499   SmallVector<const Value *> Arguments(CI->args());
3500   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3501   SmallVector<Type *> ParamTys;
3502   std::transform(FTy->param_begin(), FTy->param_end(),
3503                  std::back_inserter(ParamTys),
3504                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3505 
3506   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3507                                     dyn_cast<IntrinsicInst>(CI));
3508   return TTI.getIntrinsicInstrCost(CostAttrs,
3509                                    TargetTransformInfo::TCK_RecipThroughput);
3510 }
3511 
3512 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3513   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3514   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3515   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3516 }
3517 
3518 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3519   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3520   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3521   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3522 }
3523 
3524 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3525   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
3527   // later and will remove any ext/trunc pairs.
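  // E.g., an i32 add whose result only needs 8 bits becomes: truncate both
  // operands to <VF x i8>, perform the add on <VF x i8>, then zero-extend the
  // result back to <VF x i32>.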
3528   SmallPtrSet<Value *, 4> Erased;
3529   for (const auto &KV : Cost->getMinimalBitwidths()) {
3530     // If the value wasn't vectorized, we must maintain the original scalar
3531     // type. The absence of the value from State indicates that it
3532     // wasn't vectorized.
3533     // FIXME: Should not rely on getVPValue at this point.
3534     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3535     if (!State.hasAnyVectorValue(Def))
3536       continue;
3537     for (unsigned Part = 0; Part < UF; ++Part) {
3538       Value *I = State.get(Def, Part);
3539       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3540         continue;
3541       Type *OriginalTy = I->getType();
3542       Type *ScalarTruncatedTy =
3543           IntegerType::get(OriginalTy->getContext(), KV.second);
3544       auto *TruncatedTy = VectorType::get(
3545           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3546       if (TruncatedTy == OriginalTy)
3547         continue;
3548 
3549       IRBuilder<> B(cast<Instruction>(I));
3550       auto ShrinkOperand = [&](Value *V) -> Value * {
3551         if (auto *ZI = dyn_cast<ZExtInst>(V))
3552           if (ZI->getSrcTy() == TruncatedTy)
3553             return ZI->getOperand(0);
3554         return B.CreateZExtOrTrunc(V, TruncatedTy);
3555       };
3556 
3557       // The actual instruction modification depends on the instruction type,
3558       // unfortunately.
3559       Value *NewI = nullptr;
3560       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3561         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3562                              ShrinkOperand(BO->getOperand(1)));
3563 
3564         // Any wrapping introduced by shrinking this operation shouldn't be
3565         // considered undefined behavior. So, we can't unconditionally copy
3566         // arithmetic wrapping flags to NewI.
3567         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3568       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3569         NewI =
3570             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3571                          ShrinkOperand(CI->getOperand(1)));
3572       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3573         NewI = B.CreateSelect(SI->getCondition(),
3574                               ShrinkOperand(SI->getTrueValue()),
3575                               ShrinkOperand(SI->getFalseValue()));
3576       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3577         switch (CI->getOpcode()) {
3578         default:
3579           llvm_unreachable("Unhandled cast!");
3580         case Instruction::Trunc:
3581           NewI = ShrinkOperand(CI->getOperand(0));
3582           break;
3583         case Instruction::SExt:
3584           NewI = B.CreateSExtOrTrunc(
3585               CI->getOperand(0),
3586               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3587           break;
3588         case Instruction::ZExt:
3589           NewI = B.CreateZExtOrTrunc(
3590               CI->getOperand(0),
3591               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3592           break;
3593         }
3594       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3595         auto Elements0 =
3596             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3597         auto *O0 = B.CreateZExtOrTrunc(
3598             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3599         auto Elements1 =
3600             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3601         auto *O1 = B.CreateZExtOrTrunc(
3602             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3603 
3604         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3605       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3606         // Don't do anything with the operands, just extend the result.
3607         continue;
3608       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3609         auto Elements =
3610             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3611         auto *O0 = B.CreateZExtOrTrunc(
3612             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3613         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3614         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3615       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3616         auto Elements =
3617             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3618         auto *O0 = B.CreateZExtOrTrunc(
3619             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3620         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3621       } else {
3622         // If we don't know what to do, be conservative and don't do anything.
3623         continue;
3624       }
3625 
3626       // Lastly, extend the result.
3627       NewI->takeName(cast<Instruction>(I));
3628       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3629       I->replaceAllUsesWith(Res);
3630       cast<Instruction>(I)->eraseFromParent();
3631       Erased.insert(I);
3632       State.reset(Def, Res, Part);
3633     }
3634   }
3635 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3637   for (const auto &KV : Cost->getMinimalBitwidths()) {
3638     // If the value wasn't vectorized, we must maintain the original scalar
3639     // type. The absence of the value from State indicates that it
3640     // wasn't vectorized.
3641     // FIXME: Should not rely on getVPValue at this point.
3642     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3643     if (!State.hasAnyVectorValue(Def))
3644       continue;
3645     for (unsigned Part = 0; Part < UF; ++Part) {
3646       Value *I = State.get(Def, Part);
3647       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3648       if (Inst && Inst->use_empty()) {
3649         Value *NewI = Inst->getOperand(0);
3650         Inst->eraseFromParent();
3651         State.reset(Def, NewI, Part);
3652       }
3653     }
3654   }
3655 }
3656 
3657 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3658                                             VPlan &Plan) {
3659   // Insert truncates and extends for any truncated instructions as hints to
3660   // InstCombine.
3661   if (VF.isVector())
3662     truncateToMinimalBitwidths(State);
3663 
3664   // Fix widened non-induction PHIs by setting up the PHI operands.
3665   if (EnableVPlanNativePath)
3666     fixNonInductionPHIs(Plan, State);
3667 
3668   // At this point every instruction in the original loop is widened to a
3669   // vector form. Now we need to fix the recurrences in the loop. These PHI
3670   // nodes are currently empty because we did not want to introduce cycles.
3671   // This is the second stage of vectorizing recurrences.
3672   fixCrossIterationPHIs(State);
3673 
3674   // Forget the original basic block.
3675   PSE.getSE()->forgetLoop(OrigLoop);
3676 
3677   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3678   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3679   if (Cost->requiresScalarEpilogue(VF)) {
    // No edge from the middle block to the unique exit block has been inserted
    // and there is nothing to fix from the vector loop; phis should have
    // incoming values from the scalar loop only.
3683     Plan.clearLiveOuts();
3684   } else {
3685     // If we inserted an edge from the middle block to the unique exit block,
3686     // update uses outside the loop (phis) to account for the newly inserted
3687     // edge.
3688 
3689     // Fix-up external users of the induction variables.
3690     for (auto &Entry : Legal->getInductionVars())
3691       fixupIVUsers(Entry.first, Entry.second,
3692                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3693                    IVEndValues[Entry.first], LoopMiddleBlock,
3694                    VectorLoop->getHeader(), Plan);
3695   }
3696 
3697   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3698   // in the exit block, so update the builder.
3699   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3700   for (auto &KV : Plan.getLiveOuts())
3701     KV.second->fixPhi(Plan, State);
3702 
3703   for (Instruction *PI : PredicatedInstructions)
3704     sinkScalarOperands(&*PI);
3705 
3706   // Remove redundant induction instructions.
3707   cse(VectorLoop->getHeader());
3708 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3722   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3723                                LI->getLoopFor(LoopScalarBody),
3724                                VF.getKnownMinValue() * UF);
3725 }
3726 
3727 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3728   // In order to support recurrences we need to be able to vectorize Phi nodes.
3729   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3730   // stage #2: We now need to fix the recurrences by adding incoming edges to
3731   // the currently empty PHI nodes. At this point every instruction in the
3732   // original loop is widened to a vector form so we can use them to construct
3733   // the incoming edges.
3734   VPBasicBlock *Header =
3735       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3736   for (VPRecipeBase &R : Header->phis()) {
3737     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3738       fixReduction(ReductionPhi, State);
3739     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3740       fixFirstOrderRecurrence(FOR, State);
3741   }
3742 }
3743 
3744 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3745     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3746   // This is the second phase of vectorizing first-order recurrences. An
3747   // overview of the transformation is described below. Suppose we have the
3748   // following loop.
3749   //
3750   //   for (int i = 0; i < n; ++i)
3751   //     b[i] = a[i] - a[i - 1];
3752   //
3753   // There is a first-order recurrence on "a". For this loop, the shorthand
3754   // scalar IR looks like:
3755   //
3756   //   scalar.ph:
3757   //     s_init = a[-1]
3758   //     br scalar.body
3759   //
3760   //   scalar.body:
3761   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3762   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3763   //     s2 = a[i]
3764   //     b[i] = s2 - s1
3765   //     br cond, scalar.body, ...
3766   //
  // In this example, s1 is a recurrence because its value depends on the
3768   // previous iteration. In the first phase of vectorization, we created a
3769   // vector phi v1 for s1. We now complete the vectorization and produce the
3770   // shorthand vector IR shown below (for VF = 4, UF = 1).
3771   //
3772   //   vector.ph:
3773   //     v_init = vector(..., ..., ..., a[-1])
3774   //     br vector.body
3775   //
3776   //   vector.body
3777   //     i = phi [0, vector.ph], [i+4, vector.body]
3778   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3779   //     v2 = a[i, i+1, i+2, i+3];
3780   //     v3 = vector(v1(3), v2(0, 1, 2))
3781   //     b[i, i+1, i+2, i+3] = v2 - v3
3782   //     br cond, vector.body, middle.block
3783   //
3784   //   middle.block:
3785   //     x = v2(3)
3786   //     br scalar.ph
3787   //
3788   //   scalar.ph:
3789   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3790   //     br scalar.body
3791   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3794 
3795   // Extract the last vector element in the middle block. This will be the
3796   // initial value for the recurrence when jumping to the scalar loop.
3797   VPValue *PreviousDef = PhiR->getBackedgeValue();
3798   Value *Incoming = State.get(PreviousDef, UF - 1);
3799   auto *ExtractForScalar = Incoming;
3800   auto *IdxTy = Builder.getInt32Ty();
3801   if (VF.isVector()) {
3802     auto *One = ConstantInt::get(IdxTy, 1);
3803     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3804     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3805     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3806     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3807                                                     "vector.recur.extract");
3808   }
  // Extract the second-to-last element in the middle block if the
3810   // Phi is used outside the loop. We need to extract the phi itself
3811   // and not the last element (the phi update in the current iteration). This
3812   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3813   // when the scalar loop is not run at all.
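  //
  // In the example above (VF = 4, UF = 1), this is lane VF - 2 of v2: the
  // value the phi s1 held in the last iteration executed by the vector loop,
  // as opposed to lane VF - 1, which is the next value of the recurrence.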
3814   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3815   if (VF.isVector()) {
3816     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3817     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3818     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3819         Incoming, Idx, "vector.recur.extract.for.phi");
3820   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to the
    // final value of `Incoming`. This is analogous to the vectorized case
    // above: extracting the second-to-last element when VF > 1.
3825     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3826 
3827   // Fix the initial value of the original recurrence in the scalar loop.
3828   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3829   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3830   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3831   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3832   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3833     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3834     Start->addIncoming(Incoming, BB);
3835   }
3836 
3837   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3838   Phi->setName("scalar.recur");
3839 
3840   // Finally, fix users of the recurrence outside the loop. The users will need
3841   // either the last value of the scalar recurrence or the last value of the
3842   // vector recurrence we extracted in the middle block. Since the loop is in
3843   // LCSSA form, we just need to find all the phi nodes for the original scalar
3844   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis need to be updated.
3849   if (!Cost->requiresScalarEpilogue(VF))
3850     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3851       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3852         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3853         State.Plan->removeLiveOut(&LCSSAPhi);
3854       }
3855 }
3856 
3857 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3858                                        VPTransformState &State) {
3859   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3861   assert(Legal->isReductionVariable(OrigPhi) &&
3862          "Unable to find the reduction variable");
3863   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3864 
3865   RecurKind RK = RdxDesc.getRecurrenceKind();
3866   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3867   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3868   setDebugLocFromInst(ReductionStartValue);
3869 
3870   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3871   // This is the vector-clone of the value that leaves the loop.
3872   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3873 
3874   // Wrap flags are in general invalid after vectorization, clear them.
3875   clearReductionWrapFlags(PhiR, State);
3876 
3877   // Before each round, move the insertion point right between
3878   // the PHIs and the values we are going to write.
3879   // This allows us to write both PHINodes and the extractelement
3880   // instructions.
3881   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3882 
3883   setDebugLocFromInst(LoopExitInst);
3884 
3885   Type *PhiTy = OrigPhi->getType();
3886 
3887   VPBasicBlock *LatchVPBB =
3888       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3889   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3890   // If tail is folded by masking, the vector value to leave the loop should be
3891   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3892   // instead of the former. For an inloop reduction the reduction will already
3893   // be predicated, and does not need to be handled here.
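  //
  // As an illustrative sketch (shorthand IR, names hypothetical), the value
  // recorded as leaving the loop becomes the existing select
  //
  //   %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  //
  // rather than the vectorized LoopExitInst %rdx.next itself.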
3894   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3895     for (unsigned Part = 0; Part < UF; ++Part) {
3896       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3897       SelectInst *Sel = nullptr;
3898       for (User *U : VecLoopExitInst->users()) {
3899         if (isa<SelectInst>(U)) {
3900           assert(!Sel && "Reduction exit feeding two selects");
3901           Sel = cast<SelectInst>(U);
3902         } else
3903           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3904       }
3905       assert(Sel && "Reduction exit feeds no select");
3906       State.reset(LoopExitInstDef, Sel, Part);
3907 
3908       if (isa<FPMathOperator>(Sel))
3909         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3910 
3911       // If the target can create a predicated operator for the reduction at no
3912       // extra cost in the loop (for example a predicated vadd), it can be
3913       // cheaper for the select to remain in the loop than be sunk out of it,
3914       // and so use the select value for the phi instead of the old
3915       // LoopExitValue.
3916       if (PreferPredicatedReductionSelect ||
3917           TTI->preferPredicatedReductionSelect(
3918               RdxDesc.getOpcode(), PhiTy,
3919               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3922         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3923       }
3924     }
3925   }
3926 
3927   // If the vector reduction can be performed in a smaller type, we truncate
3928   // then extend the loop exit value to enable InstCombine to evaluate the
3929   // entire expression in the smaller type.
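  //
  // For example (illustrative shorthand, assuming an i8-wide reduction kept
  // in i32 phis with VF = 4), each unrolled part %rdx is rewritten in the
  // latch as
  //
  //   %rdx.tr = trunc <4 x i32> %rdx to <4 x i8>
  //   %rdx.ex = zext <4 x i8> %rdx.tr to <4 x i32>    ; sext if signed
  //
  // with in-loop users switched to %rdx.ex, and the final reduction in the
  // middle block performed on the truncated <4 x i8> values.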
3930   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3931     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3932     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3933     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3934     VectorParts RdxParts(UF);
3935     for (unsigned Part = 0; Part < UF; ++Part) {
3936       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3937       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3938       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3939                                         : Builder.CreateZExt(Trunc, VecTy);
3940       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3941         if (U != Trunc) {
3942           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3943           RdxParts[Part] = Extnd;
3944         }
3945     }
3946     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3947     for (unsigned Part = 0; Part < UF; ++Part) {
3948       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3949       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3950     }
3951   }
3952 
3953   // Reduce all of the unrolled parts into a single vector.
3954   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3955   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3956 
3957   // The middle block terminator has already been assigned a DebugLoc here (the
3958   // OrigLoop's single latch terminator). We want the whole middle block to
3959   // appear to execute on this line because: (a) it is all compiler generated,
3960   // (b) these instructions are always executed after evaluating the latch
3961   // conditional branch, and (c) other passes may add new predecessors which
3962   // terminate on this line. This is the easiest way to ensure we don't
3963   // accidentally cause an extra step back into the loop while debugging.
3964   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3965   if (PhiR->isOrdered())
3966     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3967   else {
3968     // Floating-point operations should have some FMF to enable the reduction.
3969     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3970     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3971     for (unsigned Part = 1; Part < UF; ++Part) {
3972       Value *RdxPart = State.get(LoopExitInstDef, Part);
3973       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3974         ReducedPartRdx = Builder.CreateBinOp(
3975             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3976       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3977         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3978                                            ReducedPartRdx, RdxPart);
3979       else
3980         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3981     }
3982   }
3983 
3984   // Create the reduction after the loop. Note that inloop reductions create the
3985   // target reduction in the loop using a Reduction recipe.
3986   if (VF.isVector() && !PhiR->isInLoop()) {
3987     ReducedPartRdx =
3988         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3989     // If the reduction can be performed in a smaller type, we need to extend
3990     // the reduction to the wider type before we branch to the original loop.
3991     if (PhiTy != RdxDesc.getRecurrenceType())
3992       ReducedPartRdx = RdxDesc.isSigned()
3993                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3994                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3995   }
3996 
3997   PHINode *ResumePhi =
3998       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3999 
4000   // Create a phi node that merges control-flow from the backedge-taken check
4001   // block and the middle block.
4002   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4003                                         LoopScalarPreHeader->getTerminator());
4004 
4005   // If we are fixing reductions in the epilogue loop then we should already
4006   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4007   // we carry over the incoming values correctly.
4008   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4009     if (Incoming == LoopMiddleBlock)
4010       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4011     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4012       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4013                               Incoming);
4014     else
4015       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4016   }
4017 
4018   // Set the resume value for this reduction
4019   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4020 
4021   // If there were stores of the reduction value to a uniform memory address
4022   // inside the loop, create the final store here.
4023   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4024     StoreInst *NewSI =
4025         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4026     propagateMetadata(NewSI, SI);
4027 
    // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
4030   }
4031 
4032   // Now, we need to fix the users of the reduction variable
4033   // inside and outside of the scalar remainder loop.
4034 
4035   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4036   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4038   if (!Cost->requiresScalarEpilogue(VF))
4039     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4040       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4041         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4042         State.Plan->removeLiveOut(&LCSSAPhi);
4043       }
4044 
4045   // Fix the scalar loop reduction variable with the incoming reduction sum
4046   // from the vector body and from the backedge value.
4047   int IncomingEdgeBlockIdx =
4048       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4049   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4050   // Pick the other block.
4051   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4052   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4053   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4054 }
4055 
4056 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4057                                                   VPTransformState &State) {
4058   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4059   RecurKind RK = RdxDesc.getRecurrenceKind();
4060   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4061     return;
4062 
4063   SmallVector<VPValue *, 8> Worklist;
4064   SmallPtrSet<VPValue *, 8> Visited;
4065   Worklist.push_back(PhiR);
4066   Visited.insert(PhiR);
4067 
4068   while (!Worklist.empty()) {
4069     VPValue *Cur = Worklist.pop_back_val();
4070     for (unsigned Part = 0; Part < UF; ++Part) {
4071       Value *V = State.get(Cur, Part);
4072       if (!isa<OverflowingBinaryOperator>(V))
4073         break;
4074       cast<Instruction>(V)->dropPoisonGeneratingFlags();
    }

    for (VPUser *U : Cur->users()) {
      auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
      if (!UserRecipe)
        continue;
      for (VPValue *V : UserRecipe->definedValues())
        if (Visited.insert(V).second)
          Worklist.push_back(V);
    }
4085   }
4086 }
4087 
4088 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4089   // The basic block and loop containing the predicated instruction.
4090   auto *PredBB = PredInst->getParent();
4091   auto *VectorLoop = LI->getLoopFor(PredBB);
4092 
4093   // Initialize a worklist with the operands of the predicated instruction.
4094   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4095 
4096   // Holds instructions that we need to analyze again. An instruction may be
4097   // reanalyzed if we don't yet know if we can sink it or not.
4098   SmallVector<Instruction *, 8> InstsToReanalyze;
4099 
4100   // Returns true if a given use occurs in the predicated block. Phi nodes use
4101   // their operands in their corresponding predecessor blocks.
4102   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4103     auto *I = cast<Instruction>(U.getUser());
4104     BasicBlock *BB = I->getParent();
4105     if (auto *Phi = dyn_cast<PHINode>(I))
4106       BB = Phi->getIncomingBlock(
4107           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4108     return BB == PredBB;
4109   };
4110 
4111   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a pass
  // through the worklist doesn't sink a single instruction.
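  //
  // As an illustrative sketch (names hypothetical): if a scalarized load was
  // predicated into block pred.load.if, its address computation
  //
  //   %gep = getelementptr i32, i32* %base, i64 %idx
  //
  // is sunk into pred.load.if as well, provided all of %gep's uses are in that
  // block; %base and %idx are then reconsidered for sinking on the next pass.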
4115   bool Changed;
4116   do {
4117     // Add the instructions that need to be reanalyzed to the worklist, and
4118     // reset the changed indicator.
4119     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4120     InstsToReanalyze.clear();
4121     Changed = false;
4122 
4123     while (!Worklist.empty()) {
4124       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4125 
4126       // We can't sink an instruction if it is a phi node, is not in the loop,
4127       // or may have side effects.
4128       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4129           I->mayHaveSideEffects())
4130         continue;
4131 
4132       // If the instruction is already in PredBB, check if we can sink its
4133       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4134       // sinking the scalar instruction I, hence it appears in PredBB; but it
4135       // may have failed to sink I's operands (recursively), which we try
4136       // (again) here.
4137       if (I->getParent() == PredBB) {
4138         Worklist.insert(I->op_begin(), I->op_end());
4139         continue;
4140       }
4141 
4142       // It's legal to sink the instruction if all its uses occur in the
4143       // predicated block. Otherwise, there's nothing to do yet, and we may
4144       // need to reanalyze the instruction.
4145       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4146         InstsToReanalyze.push_back(I);
4147         continue;
4148       }
4149 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4152       I->moveBefore(&*PredBB->getFirstInsertionPt());
4153       Worklist.insert(I->op_begin(), I->op_end());
4154 
4155       // The sinking may have enabled other instructions to be sunk, so we will
4156       // need to iterate.
4157       Changed = true;
4158     }
4159   } while (Changed);
4160 }
4161 
4162 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4163                                               VPTransformState &State) {
4164   auto Iter = depth_first(
4165       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4166   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4167     for (VPRecipeBase &P : VPBB->phis()) {
4168       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4169       if (!VPPhi)
4170         continue;
4171       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4172       // Make sure the builder has a valid insert point.
4173       Builder.SetInsertPoint(NewPhi);
4174       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4175         VPValue *Inc = VPPhi->getIncomingValue(i);
4176         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4177         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4178       }
4179     }
4180   }
4181 }
4182 
4183 bool InnerLoopVectorizer::useOrderedReductions(
4184     const RecurrenceDescriptor &RdxDesc) {
4185   return Cost->useOrderedReductions(RdxDesc);
4186 }
4187 
4188 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4189                                               VPWidenPHIRecipe *PhiR,
4190                                               VPTransformState &State) {
4191   assert(EnableVPlanNativePath &&
4192          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
4193   // Currently we enter here in the VPlan-native path for non-induction
4194   // PHIs where all control flow is uniform. We simply widen these PHIs.
4195   // Create a vector phi with no operands - the vector phi operands will be
4196   // set at the end of vector code generation.
4197   Type *VecTy = (State.VF.isScalar())
4198                     ? PN->getType()
4199                     : VectorType::get(PN->getType(), State.VF);
4200   Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4201   State.set(PhiR, VecPhi, 0);
4202 }
4203 
4204 /// A helper function for checking whether an integer division-related
4205 /// instruction may divide by zero (in which case it must be predicated if
4206 /// executed conditionally in the scalar code).
4207 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
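/// For example (illustrative), 'udiv i32 %x, 7' can never divide by zero and
/// needs no predication, whereas 'udiv i32 %x, %y' with a loop-varying %y may
/// divide by zero and so must be predicated if executed conditionally.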
4211 static bool mayDivideByZero(Instruction &I) {
4212   assert((I.getOpcode() == Instruction::UDiv ||
4213           I.getOpcode() == Instruction::SDiv ||
4214           I.getOpcode() == Instruction::URem ||
4215           I.getOpcode() == Instruction::SRem) &&
4216          "Unexpected instruction");
4217   Value *Divisor = I.getOperand(1);
4218   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4219   return !CInt || CInt->isZero();
4220 }
4221 
4222 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4223                                                VPUser &ArgOperands,
4224                                                VPTransformState &State) {
4225   assert(!isa<DbgInfoIntrinsic>(I) &&
4226          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4227   setDebugLocFromInst(&I);
4228 
4229   Module *M = I.getParent()->getParent()->getParent();
4230   auto *CI = cast<CallInst>(&I);
4231 
4232   SmallVector<Type *, 4> Tys;
4233   for (Value *ArgOperand : CI->args())
4234     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4235 
4236   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4237 
  // The flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether it is beneficial to
  // perform an intrinsic call rather than a lib call.
4241   bool NeedToScalarize = false;
4242   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4243   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4244   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4245   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4246          "Instruction should be scalarized elsewhere.");
4247   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4248          "Either the intrinsic cost or vector call cost must be valid");
4249 
4250   for (unsigned Part = 0; Part < UF; ++Part) {
4251     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4252     SmallVector<Value *, 4> Args;
4253     for (auto &I : enumerate(ArgOperands.operands())) {
4254       // Some intrinsics have a scalar argument - don't replace it with a
4255       // vector.
4256       Value *Arg;
4257       if (!UseVectorIntrinsic ||
4258           !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
4259         Arg = State.get(I.value(), Part);
4260       else
4261         Arg = State.get(I.value(), VPIteration(0, 0));
4262       if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
4263         TysForDecl.push_back(Arg->getType());
4264       Args.push_back(Arg);
4265     }
4266 
4267     Function *VectorF;
4268     if (UseVectorIntrinsic) {
4269       // Use vector version of the intrinsic.
4270       if (VF.isVector())
4271         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4272       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4273       assert(VectorF && "Can't retrieve vector intrinsic.");
4274     } else {
4275       // Use vector version of the function call.
4276       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4277 #ifndef NDEBUG
4278       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4279              "Can't create vector function.");
4280 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4292   }
4293 }
4294 
4295 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4296   // We should not collect Scalars more than once per VF. Right now, this
4297   // function is called from collectUniformsAndScalars(), which already does
4298   // this check. Collecting Scalars for VF=1 does not make any sense.
4299   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4300          "This function should not be visited twice for the same VF");
4301 
4302   // This avoids any chances of creating a REPLICATE recipe during planning
4303   // since that would result in generation of scalarized code during execution,
4304   // which is not supported for scalable vectors.
4305   if (VF.isScalable()) {
4306     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4307     return;
4308   }
4309 
4310   SmallSetVector<Instruction *, 8> Worklist;
4311 
4312   // These sets are used to seed the analysis with pointers used by memory
4313   // accesses that will remain scalar.
4314   SmallSetVector<Instruction *, 8> ScalarPtrs;
4315   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4316   auto *Latch = TheLoop->getLoopLatch();
4317 
4318   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4319   // The pointer operands of loads and stores will be scalar as long as the
4320   // memory access is not a gather or scatter operation. The value operand of a
4321   // store will remain scalar if the store is scalarized.
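  // For example (illustrative): given a unit-stride 'store i32 %v, i32* %gep'
  // with widening decision CM_Widen, the pointer %gep is a scalar use, whereas
  // the value operand %v is a scalar use only if the decision is CM_Scalarize.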
4322   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4323     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4324     assert(WideningDecision != CM_Unknown &&
4325            "Widening decision should be ready at this moment");
4326     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4327       if (Ptr == Store->getValueOperand())
4328         return WideningDecision == CM_Scalarize;
4329     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4330            "Ptr is neither a value or pointer operand");
4331     return WideningDecision != CM_GatherScatter;
4332   };
4333 
4334   // A helper that returns true if the given value is a bitcast or
4335   // getelementptr instruction contained in the loop.
4336   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4337     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4338             isa<GetElementPtrInst>(V)) &&
4339            !TheLoop->isLoopInvariant(V);
4340   };
4341 
4342   // A helper that evaluates a memory access's use of a pointer. If the use will
4343   // be a scalar use and the pointer is only used by memory accesses, we place
4344   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4345   // PossibleNonScalarPtrs.
4346   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4347     // We only care about bitcast and getelementptr instructions contained in
4348     // the loop.
4349     if (!isLoopVaryingBitCastOrGEP(Ptr))
4350       return;
4351 
4352     // If the pointer has already been identified as scalar (e.g., if it was
4353     // also identified as uniform), there's nothing to do.
4354     auto *I = cast<Instruction>(Ptr);
4355     if (Worklist.count(I))
4356       return;
4357 
4358     // If the use of the pointer will be a scalar use, and all users of the
4359     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4360     // place the pointer in PossibleNonScalarPtrs.
4361     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4362           return isa<LoadInst>(U) || isa<StoreInst>(U);
4363         }))
4364       ScalarPtrs.insert(I);
4365     else
4366       PossibleNonScalarPtrs.insert(I);
4367   };
4368 
  // We seed the scalars analysis with two classes of instructions: (1)
4370   // instructions marked uniform-after-vectorization and (2) bitcast,
4371   // getelementptr and (pointer) phi instructions used by memory accesses
4372   // requiring a scalar use.
4373   //
4374   // (1) Add to the worklist all instructions that have been identified as
4375   // uniform-after-vectorization.
4376   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4377 
4378   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4379   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4381   // scatter operation. The value operand of a store will remain scalar if the
4382   // store is scalarized.
4383   for (auto *BB : TheLoop->blocks())
4384     for (auto &I : *BB) {
4385       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4386         evaluatePtrUse(Load, Load->getPointerOperand());
4387       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4388         evaluatePtrUse(Store, Store->getPointerOperand());
4389         evaluatePtrUse(Store, Store->getValueOperand());
4390       }
4391     }
4392   for (auto *I : ScalarPtrs)
4393     if (!PossibleNonScalarPtrs.count(I)) {
4394       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4395       Worklist.insert(I);
4396     }
4397 
4398   // Insert the forced scalars.
4399   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4400   // induction variable when the PHI user is scalarized.
4401   auto ForcedScalar = ForcedScalars.find(VF);
4402   if (ForcedScalar != ForcedScalars.end())
4403     for (auto *I : ForcedScalar->second)
4404       Worklist.insert(I);
4405 
4406   // Expand the worklist by looking through any bitcasts and getelementptr
4407   // instructions we've already identified as scalar. This is similar to the
4408   // expansion step in collectLoopUniforms(); however, here we're only
4409   // expanding to include additional bitcasts and getelementptr instructions.
4410   unsigned Idx = 0;
4411   while (Idx != Worklist.size()) {
4412     Instruction *Dst = Worklist[Idx++];
4413     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4414       continue;
4415     auto *Src = cast<Instruction>(Dst->getOperand(0));
4416     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4417           auto *J = cast<Instruction>(U);
4418           return !TheLoop->contains(J) || Worklist.count(J) ||
4419                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4420                   isScalarUse(J, Src));
4421         })) {
4422       Worklist.insert(Src);
4423       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4424     }
4425   }
4426 
4427   // An induction variable will remain scalar if all users of the induction
4428   // variable and induction variable update remain scalar.
4429   for (auto &Induction : Legal->getInductionVars()) {
4430     auto *Ind = Induction.first;
4431     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4432 
4433     // If tail-folding is applied, the primary induction variable will be used
4434     // to feed a vector compare.
4435     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4436       continue;
4437 
4438     // Returns true if \p Indvar is a pointer induction that is used directly by
4439     // load/store instruction \p I.
4440     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4441                                               Instruction *I) {
4442       return Induction.second.getKind() ==
4443                  InductionDescriptor::IK_PtrInduction &&
4444              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4445              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4446     };
4447 
4448     // Determine if all users of the induction variable are scalar after
4449     // vectorization.
4450     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4451       auto *I = cast<Instruction>(U);
4452       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4453              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4454     });
4455     if (!ScalarInd)
4456       continue;
4457 
4458     // Determine if all users of the induction variable update instruction are
4459     // scalar after vectorization.
4460     auto ScalarIndUpdate =
4461         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4462           auto *I = cast<Instruction>(U);
4463           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4464                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4465         });
4466     if (!ScalarIndUpdate)
4467       continue;
4468 
4469     // The induction variable and its update instruction will remain scalar.
4470     Worklist.insert(Ind);
4471     Worklist.insert(IndUpdate);
4472     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4473     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4474                       << "\n");
4475   }
4476 
4477   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4478 }
4479 
4480 bool LoopVectorizationCostModel::isScalarWithPredication(
4481     Instruction *I, ElementCount VF) const {
4482   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4483     return false;
  switch (I->getOpcode()) {
4485   default:
4486     break;
4487   case Instruction::Load:
4488   case Instruction::Store: {
4489     if (!Legal->isMaskRequired(I))
4490       return false;
4491     auto *Ptr = getLoadStorePointerOperand(I);
4492     auto *Ty = getLoadStoreType(I);
4493     Type *VTy = Ty;
4494     if (VF.isVector())
4495       VTy = VectorType::get(Ty, VF);
4496     const Align Alignment = getLoadStoreAlignment(I);
4497     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4498                                 TTI.isLegalMaskedGather(VTy, Alignment))
4499                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4500                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4501   }
4502   case Instruction::UDiv:
4503   case Instruction::SDiv:
4504   case Instruction::SRem:
4505   case Instruction::URem:
4506     return mayDivideByZero(*I);
4507   }
4508   return false;
4509 }
4510 
4511 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4512     Instruction *I, ElementCount VF) {
4513   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4514   assert(getWideningDecision(I, VF) == CM_Unknown &&
4515          "Decision should not be set yet.");
4516   auto *Group = getInterleavedAccessGroup(I);
4517   assert(Group && "Must have a group.");
4518 
  // If the instruction's allocated size doesn't equal its type size, it
4520   // requires padding and will be scalarized.
4521   auto &DL = I->getModule()->getDataLayout();
4522   auto *ScalarTy = getLoadStoreType(I);
4523   if (hasIrregularType(ScalarTy, DL))
4524     return false;
4525 
4526   // If the group involves a non-integral pointer, we may not be able to
4527   // losslessly cast all values to a common type.
4528   unsigned InterleaveFactor = Group->getFactor();
4529   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4530   for (unsigned i = 0; i < InterleaveFactor; i++) {
4531     Instruction *Member = Group->getMember(i);
4532     if (!Member)
4533       continue;
4534     auto *MemberTy = getLoadStoreType(Member);
4535     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4536     // Don't coerce non-integral pointers to integers or vice versa.
4537     if (MemberNI != ScalarNI) {
4538       // TODO: Consider adding special nullptr value case here
4539       return false;
4540     } else if (MemberNI && ScalarNI &&
4541                ScalarTy->getPointerAddressSpace() !=
4542                MemberTy->getPointerAddressSpace()) {
4543       return false;
4544     }
4545   }
4546 
4547   // Check if masking is required.
4548   // A Group may need masking for one of two reasons: it resides in a block that
4549   // needs predication, or it was decided to use masking to deal with gaps
4550   // (either a gap at the end of a load-access that may result in a speculative
4551   // load, or any gaps in a store-access).
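  //
  // For example (illustrative), a factor-3 load group that only uses members
  // 0 and 1 leaves a gap after member 1; the wide load covering the last group
  // may read past the end of the underlying data, so it either needs the last
  // iterations run in the scalar epilogue or a mask covering the gap.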
4552   bool PredicatedAccessRequiresMasking =
4553       blockNeedsPredicationForAnyReason(I->getParent()) &&
4554       Legal->isMaskRequired(I);
4555   bool LoadAccessWithGapsRequiresEpilogMasking =
4556       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4557       !isScalarEpilogueAllowed();
4558   bool StoreAccessWithGapsRequiresMasking =
4559       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4560   if (!PredicatedAccessRequiresMasking &&
4561       !LoadAccessWithGapsRequiresEpilogMasking &&
4562       !StoreAccessWithGapsRequiresMasking)
4563     return true;
4564 
4565   // If masked interleaving is required, we expect that the user/target had
4566   // enabled it, because otherwise it either wouldn't have been created or
4567   // it should have been invalidated by the CostModel.
4568   assert(useMaskedInterleavedAccesses(TTI) &&
4569          "Masked interleave-groups for predicated accesses are not enabled.");
4570 
4571   if (Group->isReverse())
4572     return false;
4573 
4574   auto *Ty = getLoadStoreType(I);
4575   const Align Alignment = getLoadStoreAlignment(I);
4576   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4577                           : TTI.isLegalMaskedStore(Ty, Alignment);
4578 }
4579 
4580 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4581     Instruction *I, ElementCount VF) {
4582   // Get and ensure we have a valid memory instruction.
4583   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4584 
4585   auto *Ptr = getLoadStorePointerOperand(I);
4586   auto *ScalarTy = getLoadStoreType(I);
4587 
  // First of all, in order to be widened the pointer must be consecutive.
4589   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4590     return false;
4591 
4592   // If the instruction is a store located in a predicated block, it will be
4593   // scalarized.
4594   if (isScalarWithPredication(I, VF))
4595     return false;
4596 
  // If the instruction's allocated size doesn't equal its type size, it
4598   // requires padding and will be scalarized.
4599   auto &DL = I->getModule()->getDataLayout();
4600   if (hasIrregularType(ScalarTy, DL))
4601     return false;
4602 
4603   return true;
4604 }
4605 
4606 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4607   // We should not collect Uniforms more than once per VF. Right now,
4608   // this function is called from collectUniformsAndScalars(), which
4609   // already does this check. Collecting Uniforms for VF=1 does not make any
4610   // sense.
4611 
4612   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4613          "This function should not be visited twice for the same VF");
4614 
  // Visit the list of Uniforms. Even if we don't find any uniform value, we
  // won't analyze it again: Uniforms.count(VF) will return 1.
4617   Uniforms[VF].clear();
4618 
4619   // We now know that the loop is vectorizable!
4620   // Collect instructions inside the loop that will remain uniform after
4621   // vectorization.
4622 
4623   // Global values, params and instructions outside of current loop are out of
4624   // scope.
4625   auto isOutOfScope = [&](Value *V) -> bool {
4626     Instruction *I = dyn_cast<Instruction>(V);
4627     return (!I || !TheLoop->contains(I));
4628   };
4629 
4630   // Worklist containing uniform instructions demanding lane 0.
4631   SetVector<Instruction *> Worklist;
4632   BasicBlock *Latch = TheLoop->getLoopLatch();
4633 
4634   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4635   // that are scalar with predication must not be considered uniform after
4636   // vectorization, because that would create an erroneous replicating region
4637   // where only a single instance out of VF should be formed.
4638   // TODO: optimize such seldom cases if found important, see PR40816.
4639   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4640     if (isOutOfScope(I)) {
4641       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4642                         << *I << "\n");
4643       return;
4644     }
4645     if (isScalarWithPredication(I, VF)) {
4646       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4647                         << *I << "\n");
4648       return;
4649     }
4650     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4651     Worklist.insert(I);
4652   };
4653 
4654   // Start with the conditional branch. If the branch condition is an
4655   // instruction contained in the loop that is only used by the branch, it is
4656   // uniform.
4657   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4658   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4659     addToWorklistIfAllowed(Cmp);
4660 
4661   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4662     InstWidening WideningDecision = getWideningDecision(I, VF);
4663     assert(WideningDecision != CM_Unknown &&
4664            "Widening decision should be ready at this moment");
4665 
4666     // A uniform memory op is itself uniform.  We exclude uniform stores
4667     // here as they demand the last lane, not the first one.
4668     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4669       assert(WideningDecision == CM_Scalarize);
4670       return true;
4671     }
4672 
4673     return (WideningDecision == CM_Widen ||
4674             WideningDecision == CM_Widen_Reverse ||
4675             WideningDecision == CM_Interleave);
4676   };
4677 
4679   // Returns true if Ptr is the pointer operand of a memory access instruction
4680   // I, and I is known to not require scalarization.
4681   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4682     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4683   };
4684 
4685   // Holds a list of values which are known to have at least one uniform use.
4686   // Note that there may be other uses which aren't uniform.  A "uniform use"
4687   // here is something which only demands lane 0 of the unrolled iterations;
4688   // it does not imply that all lanes produce the same value (e.g. this is not
4689   // the usual meaning of uniform)
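  //
  // For example (illustrative), a consecutive widened load only demands lane 0
  // of its (loop-varying) address GEP, even though the GEP produces a distinct
  // value for every lane.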
4690   SetVector<Value *> HasUniformUse;
4691 
4692   // Scan the loop for instructions which are either a) known to have only
4693   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4694   for (auto *BB : TheLoop->blocks())
4695     for (auto &I : *BB) {
4696       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4697         switch (II->getIntrinsicID()) {
4698         case Intrinsic::sideeffect:
4699         case Intrinsic::experimental_noalias_scope_decl:
4700         case Intrinsic::assume:
4701         case Intrinsic::lifetime_start:
4702         case Intrinsic::lifetime_end:
4703           if (TheLoop->hasLoopInvariantOperands(&I))
4704             addToWorklistIfAllowed(&I);
4705           break;
4706         default:
4707           break;
4708         }
4709       }
4710 
4711       // ExtractValue instructions must be uniform, because the operands are
4712       // known to be loop-invariant.
4713       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4714         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4715                "Expected aggregate value to be loop invariant");
4716         addToWorklistIfAllowed(EVI);
4717         continue;
4718       }
4719 
4720       // If there's no pointer operand, there's nothing to do.
4721       auto *Ptr = getLoadStorePointerOperand(&I);
4722       if (!Ptr)
4723         continue;
4724 
4725       // A uniform memory op is itself uniform.  We exclude uniform stores
4726       // here as they demand the last lane, not the first one.
4727       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4728         addToWorklistIfAllowed(&I);
4729 
4730       if (isUniformDecision(&I, VF)) {
4731         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4732         HasUniformUse.insert(Ptr);
4733       }
4734     }
4735 
4736   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4737   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4738   // disallows uses outside the loop as well.
4739   for (auto *V : HasUniformUse) {
4740     if (isOutOfScope(V))
4741       continue;
4742     auto *I = cast<Instruction>(V);
4743     auto UsersAreMemAccesses =
4744       llvm::all_of(I->users(), [&](User *U) -> bool {
4745         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4746       });
4747     if (UsersAreMemAccesses)
4748       addToWorklistIfAllowed(I);
4749   }
4750 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
4754   unsigned idx = 0;
4755   while (idx != Worklist.size()) {
4756     Instruction *I = Worklist[idx++];
4757 
4758     for (auto OV : I->operand_values()) {
4759       // isOutOfScope operands cannot be uniform instructions.
4760       if (isOutOfScope(OV))
4761         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4764       auto *OP = dyn_cast<PHINode>(OV);
4765       if (OP && Legal->isFirstOrderRecurrence(OP))
4766         continue;
4767       // If all the users of the operand are uniform, then add the
4768       // operand into the uniform worklist.
4769       auto *OI = cast<Instruction>(OV);
4770       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4771             auto *J = cast<Instruction>(U);
4772             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4773           }))
4774         addToWorklistIfAllowed(OI);
4775     }
4776   }
4777 
4778   // For an instruction to be added into Worklist above, all its users inside
4779   // the loop should also be in Worklist. However, this condition cannot be
4780   // true for phi nodes that form a cyclic dependence. We must process phi
4781   // nodes separately. An induction variable will remain uniform if all users
4782   // of the induction variable and induction variable update remain uniform.
4783   // The code below handles both pointer and non-pointer induction variables.
4784   for (auto &Induction : Legal->getInductionVars()) {
4785     auto *Ind = Induction.first;
4786     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4787 
4788     // Determine if all users of the induction variable are uniform after
4789     // vectorization.
4790     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4791       auto *I = cast<Instruction>(U);
4792       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4793              isVectorizedMemAccessUse(I, Ind);
4794     });
4795     if (!UniformInd)
4796       continue;
4797 
4798     // Determine if all users of the induction variable update instruction are
4799     // uniform after vectorization.
4800     auto UniformIndUpdate =
4801         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4802           auto *I = cast<Instruction>(U);
4803           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4804                  isVectorizedMemAccessUse(I, IndUpdate);
4805         });
4806     if (!UniformIndUpdate)
4807       continue;
4808 
4809     // The induction variable and its update instruction will remain uniform.
4810     addToWorklistIfAllowed(Ind);
4811     addToWorklistIfAllowed(IndUpdate);
4812   }
4813 
4814   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4815 }
4816 
4817 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4818   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4819 
4820   if (Legal->getRuntimePointerChecking()->Need) {
4821     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4822         "runtime pointer checks needed. Enable vectorization of this "
4823         "loop with '#pragma clang loop vectorize(enable)' when "
4824         "compiling with -Os/-Oz",
4825         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4826     return true;
4827   }
4828 
4829   if (!PSE.getPredicate().isAlwaysTrue()) {
4830     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4831         "runtime SCEV checks needed. Enable vectorization of this "
4832         "loop with '#pragma clang loop vectorize(enable)' when "
4833         "compiling with -Os/-Oz",
4834         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4835     return true;
4836   }
4837 
4838   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4839   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4840     reportVectorizationFailure("Runtime stride check for small trip count",
4841         "runtime stride == 1 checks needed. Enable vectorization of "
4842         "this loop without such check by compiling with -Os/-Oz",
4843         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4844     return true;
4845   }
4846 
4847   return false;
4848 }
4849 
4850 ElementCount
4851 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4852   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4853     return ElementCount::getScalable(0);
4854 
4855   if (Hints->isScalableVectorizationDisabled()) {
4856     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4857                             "ScalableVectorizationDisabled", ORE, TheLoop);
4858     return ElementCount::getScalable(0);
4859   }
4860 
4861   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4862 
4863   auto MaxScalableVF = ElementCount::getScalable(
4864       std::numeric_limits<ElementCount::ScalarTy>::max());
4865 
4866   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4867   // FIXME: While for scalable vectors this is currently sufficient, this should
4868   // be replaced by a more detailed mechanism that filters out specific VFs,
4869   // instead of invalidating vectorization for a whole set of VFs based on the
4870   // MaxVF.
4871 
4872   // Disable scalable vectorization if the loop contains unsupported reductions.
4873   if (!canVectorizeReductions(MaxScalableVF)) {
4874     reportVectorizationInfo(
4875         "Scalable vectorization not supported for the reduction "
4876         "operations found in this loop.",
4877         "ScalableVFUnfeasible", ORE, TheLoop);
4878     return ElementCount::getScalable(0);
4879   }
4880 
4881   // Disable scalable vectorization if the loop contains any instructions
4882   // with element types not supported for scalable vectors.
4883   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4884         return !Ty->isVoidTy() &&
4885                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4886       })) {
4887     reportVectorizationInfo("Scalable vectorization is not supported "
4888                             "for all element types found in this loop.",
4889                             "ScalableVFUnfeasible", ORE, TheLoop);
4890     return ElementCount::getScalable(0);
4891   }
4892 
4893   if (Legal->isSafeForAnyVectorWidth())
4894     return MaxScalableVF;
4895 
4896   // Limit MaxScalableVF by the maximum safe dependence distance.
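  // For example (illustrative numbers): with MaxSafeElements = 32 and a
  // maximum vscale of 16, the result below is clamped to at most vscale x 2.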
4897   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4898   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4899     MaxVScale =
4900         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4901   MaxScalableVF = ElementCount::getScalable(
4902       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
4903   if (!MaxScalableVF)
4904     reportVectorizationInfo(
4905         "Max legal vector width too small, scalable vectorization "
4906         "unfeasible.",
4907         "ScalableVFUnfeasible", ORE, TheLoop);
4908 
4909   return MaxScalableVF;
4910 }
4911 
4912 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4913     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4914   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4915   unsigned SmallestType, WidestType;
4916   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4917 
4918   // Get the maximum safe dependence distance in bits computed by LAA.
4919   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
4921   // dependence distance).
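  // For example (illustrative numbers): a maximum safe vector width of 256
  // bits and a widest type of 32 bits give MaxSafeElements = 8.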
4922   unsigned MaxSafeElements =
4923       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4924 
4925   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4926   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4927 
4928   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4929                     << ".\n");
4930   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4931                     << ".\n");
4932 
4933   // First analyze the UserVF, fall back if the UserVF should be ignored.
4934   if (UserVF) {
4935     auto MaxSafeUserVF =
4936         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4937 
4938     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4939       // If `VF=vscale x N` is safe, then so is `VF=N`
4940       if (UserVF.isScalable())
4941         return FixedScalableVFPair(
4942             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4943       else
4944         return UserVF;
4945     }
4946 
4947     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4948 
4949     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4950     // is better to ignore the hint and let the compiler choose a suitable VF.
4951     if (!UserVF.isScalable()) {
4952       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4953                         << " is unsafe, clamping to max safe VF="
4954                         << MaxSafeFixedVF << ".\n");
4955       ORE->emit([&]() {
4956         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4957                                           TheLoop->getStartLoc(),
4958                                           TheLoop->getHeader())
4959                << "User-specified vectorization factor "
4960                << ore::NV("UserVectorizationFactor", UserVF)
4961                << " is unsafe, clamping to maximum safe vectorization factor "
4962                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4963       });
4964       return MaxSafeFixedVF;
4965     }
4966 
4967     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4968       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4969                         << " is ignored because scalable vectors are not "
4970                            "available.\n");
4971       ORE->emit([&]() {
4972         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4973                                           TheLoop->getStartLoc(),
4974                                           TheLoop->getHeader())
4975                << "User-specified vectorization factor "
4976                << ore::NV("UserVectorizationFactor", UserVF)
4977                << " is ignored because the target does not support scalable "
4978                   "vectors. The compiler will pick a more suitable value.";
4979       });
4980     } else {
4981       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4982                         << " is unsafe. Ignoring scalable UserVF.\n");
4983       ORE->emit([&]() {
4984         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4985                                           TheLoop->getStartLoc(),
4986                                           TheLoop->getHeader())
4987                << "User-specified vectorization factor "
4988                << ore::NV("UserVectorizationFactor", UserVF)
4989                << " is unsafe. Ignoring the hint to let the compiler pick a "
4990                   "more suitable value.";
4991       });
4992     }
4993   }
4994 
4995   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4996                     << " / " << WidestType << " bits.\n");
4997 
4998   FixedScalableVFPair Result(ElementCount::getFixed(1),
4999                              ElementCount::getScalable(0));
5000   if (auto MaxVF =
5001           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5002                                   MaxSafeFixedVF, FoldTailByMasking))
5003     Result.FixedVF = MaxVF;
5004 
5005   if (auto MaxVF =
5006           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5007                                   MaxSafeScalableVF, FoldTailByMasking))
5008     if (MaxVF.isScalable()) {
5009       Result.ScalableVF = MaxVF;
5010       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5011                         << "\n");
5012     }
5013 
5014   return Result;
5015 }
5016 
5017 FixedScalableVFPair
5018 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5019   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
5022     reportVectorizationFailure(
5023         "Not inserting runtime ptr check for divergent target",
5024         "runtime pointer checks needed. Not enabled for divergent target",
5025         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5026     return FixedScalableVFPair::getNone();
5027   }
5028 
5029   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5030   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5031   if (TC == 1) {
5032     reportVectorizationFailure("Single iteration (non) loop",
5033         "loop trip count is one, irrelevant for vectorization",
5034         "SingleIterationLoop", ORE, TheLoop);
5035     return FixedScalableVFPair::getNone();
5036   }
5037 
5038   switch (ScalarEpilogueStatus) {
5039   case CM_ScalarEpilogueAllowed:
5040     return computeFeasibleMaxVF(TC, UserVF, false);
5041   case CM_ScalarEpilogueNotAllowedUsePredicate:
5042     LLVM_FALLTHROUGH;
5043   case CM_ScalarEpilogueNotNeededUsePredicate:
5044     LLVM_DEBUG(
5045         dbgs() << "LV: vector predicate hint/switch found.\n"
5046                << "LV: Not allowing scalar epilogue, creating predicated "
5047                << "vector loop.\n");
5048     break;
5049   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5050     // fallthrough as a special case of OptForSize
5051   case CM_ScalarEpilogueNotAllowedOptSize:
5052     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5053       LLVM_DEBUG(
5054           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5055     else
5056       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5057                         << "count.\n");
5058 
5059     // Bail if runtime checks are required, which are not good when optimising
5060     // for size.
5061     if (runtimeChecksRequired())
5062       return FixedScalableVFPair::getNone();
5063 
5064     break;
5065   }
5066 
  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. Otherwise we'd have to handle
  // the fact that not every instruction executes on the last iteration. This
  // will require a lane mask which varies through the vector loop body. (TODO)
5071   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5072     // If there was a tail-folding hint/switch, but we can't fold the tail by
5073     // masking, fallback to a vectorization with a scalar epilogue.
5074     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5075       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5076                            "scalar epilogue instead.\n");
5077       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5078       return computeFeasibleMaxVF(TC, UserVF, false);
5079     }
5080     return FixedScalableVFPair::getNone();
5081   }
5082 
  // Now try tail folding.
5084 
5085   // Invalidate interleave groups that require an epilogue if we can't mask
5086   // the interleave-group.
5087   if (!useMaskedInterleavedAccesses(TTI)) {
5088     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5089            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5092     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5093   }
5094 
5095   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
5098   // FIXME: The condition below pessimises the case for fixed-width vectors,
5099   // when scalable VFs are also candidates for vectorization.
5100   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5101     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5102     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5103            "MaxFixedVF must be a power of 2");
5104     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5105                                    : MaxFixedVF.getFixedValue();
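    // For example (illustrative numbers): with MaxFixedVF = 8 and UserIC = 2,
    // MaxVFtimesIC is 16, so tail folding can be skipped whenever the trip
    // count is known to be a multiple of 16.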
5106     ScalarEvolution *SE = PSE.getSE();
5107     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5108     const SCEV *ExitCount = SE->getAddExpr(
5109         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5110     const SCEV *Rem = SE->getURemExpr(
5111         SE->applyLoopGuards(ExitCount, TheLoop),
5112         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5113     if (Rem->isZero()) {
5114       // Accept MaxFixedVF if we do not have a tail.
5115       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5116       return MaxFactors;
5117     }
5118   }
5119 
5120   // If we don't know the precise trip count, or if the trip count that we
5121   // found modulo the vectorization factor is not zero, try to fold the tail
5122   // by masking.
5123   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5124   if (Legal->prepareToFoldTailByMasking()) {
5125     FoldTailByMasking = true;
5126     return MaxFactors;
5127   }
5128 
5129   // If there was a tail-folding hint/switch, but we can't fold the tail by
5130   // masking, fallback to a vectorization with a scalar epilogue.
5131   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5132     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5133                          "scalar epilogue instead.\n");
5134     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5135     return MaxFactors;
5136   }
5137 
5138   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5139     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5140     return FixedScalableVFPair::getNone();
5141   }
5142 
5143   if (TC == 0) {
5144     reportVectorizationFailure(
5145         "Unable to calculate the loop count due to complex control flow",
5146         "unable to calculate the loop count due to complex control flow",
5147         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5148     return FixedScalableVFPair::getNone();
5149   }
5150 
5151   reportVectorizationFailure(
5152       "Cannot optimize for size and vectorize at the same time.",
5153       "cannot optimize for size and vectorize at the same time. "
5154       "Enable vectorization of this loop with '#pragma clang loop "
5155       "vectorize(enable)' when compiling with -Os/-Oz",
5156       "NoTailLoopWithOptForSize", ORE, TheLoop);
5157   return FixedScalableVFPair::getNone();
5158 }
5159 
5160 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5161     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5162     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5163   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5164   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5165       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5166                            : TargetTransformInfo::RGK_FixedWidthVector);
5167 
5168   // Convenience function to return the minimum of two ElementCounts.
5169   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5170     assert((LHS.isScalable() == RHS.isScalable()) &&
5171            "Scalable flags must match");
5172     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5173   };
5174 
5175   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
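  // For example (illustrative numbers): a 128-bit fixed-width register and a
  // widest type of 32 bits give a maximum of 4 elements per vector.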
5177   auto MaxVectorElementCount = ElementCount::get(
5178       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5179       ComputeScalableMaxVF);
5180   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5181   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5182                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5183 
5184   if (!MaxVectorElementCount) {
5185     LLVM_DEBUG(dbgs() << "LV: The target has no "
5186                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5187                       << " vector registers.\n");
5188     return ElementCount::getFixed(1);
5189   }
5190 
5191   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5192   if (ConstTripCount &&
5193       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5194       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If the loop trip count (TC) is known at compile time, there is no point
    // in choosing a VF greater than TC (as done in the loop below). Select the
    // maximum power of two which doesn't exceed TC.
5198     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5199     // when the TC is less than or equal to the known number of lanes.
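    // For example (illustrative numbers): a constant trip count of 6 is
    // clamped down to a fixed VF of 4.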
5200     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5201     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5202                          "exceeding the constant trip count: "
5203                       << ClampedConstTripCount << "\n");
5204     return ElementCount::getFixed(ClampedConstTripCount);
5205   }
5206 
5207   TargetTransformInfo::RegisterKind RegKind =
5208       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5209                            : TargetTransformInfo::RGK_FixedWidthVector;
5210   ElementCount MaxVF = MaxVectorElementCount;
5211   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5212                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5213     auto MaxVectorElementCountMaxBW = ElementCount::get(
5214         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5215         ComputeScalableMaxVF);
5216     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5217 
5218     // Collect all viable vectorization factors larger than the default MaxVF
5219     // (i.e. MaxVectorElementCount).
5220     SmallVector<ElementCount, 8> VFs;
5221     for (ElementCount VS = MaxVectorElementCount * 2;
5222          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5223       VFs.push_back(VS);
5224 
5225     // For each VF calculate its register usage.
5226     auto RUs = calculateRegisterUsage(VFs);
5227 
5228     // Select the largest VF which doesn't require more registers than existing
5229     // ones.
5230     for (int i = RUs.size() - 1; i >= 0; --i) {
5231       bool Selected = true;
5232       for (auto &pair : RUs[i].MaxLocalUsers) {
5233         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5234         if (pair.second > TargetNumRegisters)
5235           Selected = false;
5236       }
5237       if (Selected) {
5238         MaxVF = VFs[i];
5239         break;
5240       }
5241     }
5242     if (ElementCount MinVF =
5243             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5244       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5245         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5246                           << ") with target's minimum: " << MinVF << '\n');
5247         MaxVF = MinVF;
5248       }
5249     }
5250 
5251     // Invalidate any widening decisions we might have made, in case the loop
5252     // requires prediction (decided later), but we have already made some
5253     // load/store widening decisions.
5254     invalidateCostModelingDecisions();
5255   }
5256   return MaxVF;
5257 }
5258 
5259 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5260   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5261     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5262     auto Min = Attr.getVScaleRangeMin();
5263     auto Max = Attr.getVScaleRangeMax();
5264     if (Max && Min == Max)
5265       return Max;
5266   }
5267 
5268   return TTI.getVScaleForTuning();
5269 }
5270 
5271 bool LoopVectorizationCostModel::isMoreProfitable(
5272     const VectorizationFactor &A, const VectorizationFactor &B) const {
5273   InstructionCost CostA = A.Cost;
5274   InstructionCost CostB = B.Cost;
5275 
5276   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5277 
5278   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5279       MaxTripCount) {
5280     // If we are folding the tail and the trip count is a known (possibly small)
5281     // constant, the trip count will be rounded up to an integer number of
5282     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5283     // which we compare directly. When not folding the tail, the total cost will
5284     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the trip
    // count as here.
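    // For example (illustrative numbers): with MaxTripCount = 10,
    // A = {VF=4, Cost=8} and B = {VF=8, Cost=14}, RTCostA = 8 * 3 = 24 and
    // RTCostB = 14 * 2 = 28, so A is considered more profitable.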
5287     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5288     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5289     return RTCostA < RTCostB;
5290   }
5291 
5292   // Improve estimate for the vector width if it is scalable.
5293   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5294   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5295   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5296     if (A.Width.isScalable())
5297       EstimatedWidthA *= VScale.getValue();
5298     if (B.Width.isScalable())
5299       EstimatedWidthB *= VScale.getValue();
5300   }
5301 
5302   // Assume vscale may be larger than 1 (or the value being tuned for),
5303   // so that scalable vectorization is slightly favorable over fixed-width
5304   // vectorization.
5305   if (A.Width.isScalable() && !B.Width.isScalable())
5306     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5307 
5308   // To avoid the need for FP division:
5309   //      (CostA / A.Width) < (CostB / B.Width)
5310   // <=>  (CostA * B.Width) < (CostB * A.Width)
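  // For example (illustrative numbers): CostA = 8 at estimated width 4 and
  // CostB = 6 at estimated width 2 compare as 8 * 2 = 16 < 6 * 4 = 24, i.e. a
  // per-lane cost of 2 vs. 3, so A is considered more profitable.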
5311   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5312 }
5313 
5314 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5315     const ElementCountSet &VFCandidates) {
5316   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5317   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5318   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5319   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5320          "Expected Scalar VF to be a candidate");
5321 
5322   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5323   VectorizationFactor ChosenFactor = ScalarCost;
5324 
5325   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5326   if (ForceVectorization && VFCandidates.size() > 1) {
5327     // Ignore scalar width, because the user explicitly wants vectorization.
5328     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5329     // evaluation.
5330     ChosenFactor.Cost = InstructionCost::getMax();
5331   }
5332 
5333   SmallVector<InstructionVFPair> InvalidCosts;
5334   for (const auto &i : VFCandidates) {
5335     // The cost for scalar VF=1 is already calculated, so ignore it.
5336     if (i.isScalar())
5337       continue;
5338 
5339     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5340     VectorizationFactor Candidate(i, C.first);
5341 
5342 #ifndef NDEBUG
5343     unsigned AssumedMinimumVscale = 1;
5344     if (Optional<unsigned> VScale = getVScaleForTuning())
5345       AssumedMinimumVscale = VScale.getValue();
5346     unsigned Width =
5347         Candidate.Width.isScalable()
5348             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5349             : Candidate.Width.getFixedValue();
5350     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5351                       << " costs: " << (Candidate.Cost / Width));
5352     if (i.isScalable())
5353       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5354                         << AssumedMinimumVscale << ")");
5355     LLVM_DEBUG(dbgs() << ".\n");
5356 #endif
5357 
5358     if (!C.second && !ForceVectorization) {
5359       LLVM_DEBUG(
5360           dbgs() << "LV: Not considering vector loop of width " << i
5361                  << " because it will not generate any vector instructions.\n");
5362       continue;
5363     }
5364 
    // If profitable, add it to the ProfitableVFs list.
5366     if (isMoreProfitable(Candidate, ScalarCost))
5367       ProfitableVFs.push_back(Candidate);
5368 
5369     if (isMoreProfitable(Candidate, ChosenFactor))
5370       ChosenFactor = Candidate;
5371   }
5372 
5373   // Emit a report of VFs with invalid costs in the loop.
5374   if (!InvalidCosts.empty()) {
5375     // Group the remarks per instruction, keeping the instruction order from
5376     // InvalidCosts.
5377     std::map<Instruction *, unsigned> Numbering;
5378     unsigned I = 0;
5379     for (auto &Pair : InvalidCosts)
5380       if (!Numbering.count(Pair.first))
5381         Numbering[Pair.first] = I++;
5382 
5383     // Sort the list, first on instruction(number) then on VF.
5384     llvm::sort(InvalidCosts,
5385                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5386                  if (Numbering[A.first] != Numbering[B.first])
5387                    return Numbering[A.first] < Numbering[B.first];
5388                  ElementCountComparator ECC;
5389                  return ECC(A.second, B.second);
5390                });
5391 
5392     // For a list of ordered instruction-vf pairs:
5393     //   [(load, vf1), (load, vf2), (store, vf1)]
5394     // Group the instructions together to emit separate remarks for:
5395     //   load  (vf1, vf2)
5396     //   store (vf1)
5397     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5398     auto Subset = ArrayRef<InstructionVFPair>();
5399     do {
5400       if (Subset.empty())
5401         Subset = Tail.take_front(1);
5402 
5403       Instruction *I = Subset.front().first;
5404 
      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5410       if (Subset == Tail || Tail[Subset.size()].first != I) {
5411         std::string OutString;
5412         raw_string_ostream OS(OutString);
5413         assert(!Subset.empty() && "Unexpected empty range");
5414         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5415         for (auto &Pair : Subset)
5416           OS << (Pair.second == Subset.front().second ? "" : ", ")
5417              << Pair.second;
5418         OS << "):";
5419         if (auto *CI = dyn_cast<CallInst>(I))
5420           OS << " call to " << CI->getCalledFunction()->getName();
5421         else
5422           OS << " " << I->getOpcodeName();
5423         OS.flush();
5424         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5425         Tail = Tail.drop_front(Subset.size());
5426         Subset = {};
5427       } else
5428         // Grow the subset by one element
5429         Subset = Tail.take_front(Subset.size() + 1);
5430     } while (!Tail.empty());
5431   }
5432 
5433   if (!EnableCondStoresVectorization && NumPredStores) {
5434     reportVectorizationFailure("There are conditional stores.",
5435         "store that is conditionally executed prevents vectorization",
5436         "ConditionalStore", ORE, TheLoop);
5437     ChosenFactor = ScalarCost;
5438   }
5439 
5440   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5441                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5442              << "LV: Vectorization seems to be not beneficial, "
5443              << "but was forced by a user.\n");
5444   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5445   return ChosenFactor;
5446 }
5447 
5448 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5449     const Loop &L, ElementCount VF) const {
5450   // Cross iteration phis such as reductions need special handling and are
5451   // currently unsupported.
5452   if (any_of(L.getHeader()->phis(),
5453              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5454     return false;
5455 
5456   // Phis with uses outside of the loop require special handling and are
5457   // currently unsupported.
5458   for (auto &Entry : Legal->getInductionVars()) {
5459     // Look for uses of the value of the induction at the last iteration.
5460     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5461     for (User *U : PostInc->users())
5462       if (!L.contains(cast<Instruction>(U)))
5463         return false;
5464     // Look for uses of penultimate value of the induction.
5465     for (User *U : Entry.first->users())
5466       if (!L.contains(cast<Instruction>(U)))
5467         return false;
5468   }
5469 
5470   // Induction variables that are widened require special handling that is
5471   // currently not supported.
5472   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5473         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5474                  this->isProfitableToScalarize(Entry.first, VF));
5475       }))
5476     return false;
5477 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly.  It may be fine, but it needs to be audited and
  // tested.
5481   if (L.getExitingBlock() != L.getLoopLatch())
5482     return false;
5483 
5484   return true;
5485 }
5486 
5487 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5488     const ElementCount VF) const {
5489   // FIXME: We need a much better cost-model to take different parameters such
5490   // as register pressure, code size increase and cost of extra branches into
5491   // account. For now we apply a very crude heuristic and only consider loops
5492   // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5495   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5496     return false;
5497   // FIXME: We should consider changing the threshold for scalable
5498   // vectors to take VScaleForTuning into account.
5499   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5500     return true;
5501   return false;
5502 }
5503 
5504 VectorizationFactor
5505 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5506     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5507   VectorizationFactor Result = VectorizationFactor::Disabled();
5508   if (!EnableEpilogueVectorization) {
5509     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5510     return Result;
5511   }
5512 
5513   if (!isScalarEpilogueAllowed()) {
5514     LLVM_DEBUG(
5515         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5516                   "allowed.\n";);
5517     return Result;
5518   }
5519 
5520   // Not really a cost consideration, but check for unsupported cases here to
5521   // simplify the logic.
5522   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5523     LLVM_DEBUG(
5524         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5525                   "not a supported candidate.\n";);
5526     return Result;
5527   }
5528 
5529   if (EpilogueVectorizationForceVF > 1) {
5530     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
5532     if (LVP.hasPlanWithVF(ForcedEC))
5533       return {ForcedEC, 0};
5534     else {
5535       LLVM_DEBUG(
5536           dbgs()
5537               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5538       return Result;
5539     }
5540   }
5541 
5542   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5543       TheLoop->getHeader()->getParent()->hasMinSize()) {
5544     LLVM_DEBUG(
5545         dbgs()
5546             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5547     return Result;
5548   }
5549 
5550   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5551     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5552                          "this loop\n");
5553     return Result;
5554   }
5555 
5556   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5557   // the main loop handles 8 lanes per iteration. We could still benefit from
5558   // vectorizing the epilogue loop with VF=4.
5559   ElementCount EstimatedRuntimeVF = MainLoopVF;
5560   if (MainLoopVF.isScalable()) {
5561     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5562     if (Optional<unsigned> VScale = getVScaleForTuning())
5563       EstimatedRuntimeVF *= VScale.getValue();
5564   }
5565 
5566   for (auto &NextVF : ProfitableVFs)
5567     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5568           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5569          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5570         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5571         LVP.hasPlanWithVF(NextVF.Width))
5572       Result = NextVF;
5573 
5574   if (Result != VectorizationFactor::Disabled())
5575     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5576                       << Result.Width << "\n";);
5577   return Result;
5578 }
5579 
5580 std::pair<unsigned, unsigned>
5581 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5582   unsigned MinWidth = -1U;
5583   unsigned MaxWidth = 8;
5584   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5585   // For in-loop reductions, no element types are added to ElementTypesInLoop
5586   // if there are no loads/stores in the loop. In this case, check through the
5587   // reduction variables to determine the maximum width.
5588   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5589     // Reset MaxWidth so that we can find the smallest type used by recurrences
5590     // in the loop.
5591     MaxWidth = -1U;
5592     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5593       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5594       // When finding the min width used by the recurrence we need to account
5595       // for casts on the input operands of the recurrence.
5596       MaxWidth = std::min<unsigned>(
5597           MaxWidth, std::min<unsigned>(
5598                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5599                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5600     }
5601   } else {
5602     for (Type *T : ElementTypesInLoop) {
5603       MinWidth = std::min<unsigned>(
5604           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5605       MaxWidth = std::max<unsigned>(
5606           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5607     }
5608   }
5609   return {MinWidth, MaxWidth};
5610 }
5611 
5612 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5613   ElementTypesInLoop.clear();
5614   // For each block.
5615   for (BasicBlock *BB : TheLoop->blocks()) {
5616     // For each instruction in the loop.
5617     for (Instruction &I : BB->instructionsWithoutDebug()) {
5618       Type *T = I.getType();
5619 
5620       // Skip ignored values.
5621       if (ValuesToIgnore.count(&I))
5622         continue;
5623 
5624       // Only examine Loads, Stores and PHINodes.
5625       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5626         continue;
5627 
5628       // Examine PHI nodes that are reduction variables. Update the type to
5629       // account for the recurrence type.
5630       if (auto *PN = dyn_cast<PHINode>(&I)) {
5631         if (!Legal->isReductionVariable(PN))
5632           continue;
5633         const RecurrenceDescriptor &RdxDesc =
5634             Legal->getReductionVars().find(PN)->second;
5635         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5636             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5637                                       RdxDesc.getRecurrenceType(),
5638                                       TargetTransformInfo::ReductionFlags()))
5639           continue;
5640         T = RdxDesc.getRecurrenceType();
5641       }
5642 
5643       // Examine the stored values.
5644       if (auto *ST = dyn_cast<StoreInst>(&I))
5645         T = ST->getValueOperand()->getType();
5646 
5647       assert(T->isSized() &&
5648              "Expected the load/store/recurrence type to be sized");
5649 
5650       ElementTypesInLoop.insert(T);
5651     }
5652   }
5653 }
5654 
5655 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5656                                                            unsigned LoopCost) {
5657   // -- The interleave heuristics --
5658   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5659   // There are many micro-architectural considerations that we can't predict
5660   // at this level. For example, frontend pressure (on decode or fetch) due to
5661   // code size, or the number and capabilities of the execution ports.
5662   //
5663   // We use the following heuristics to select the interleave count:
5664   // 1. If the code has reductions, then we interleave to break the cross
5665   // iteration dependency.
5666   // 2. If the loop is really small, then we interleave to reduce the loop
5667   // overhead.
5668   // 3. We don't interleave if we think that we will spill registers to memory
5669   // due to the increased register pressure.
5670 
5671   if (!isScalarEpilogueAllowed())
5672     return 1;
5673 
  // Do not interleave if there is a maximum safe dependence distance: it has
  // already been used to limit the vectorization factor.
5675   if (Legal->getMaxSafeDepDistBytes() != -1U)
5676     return 1;
5677 
5678   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5679   const bool HasReductions = !Legal->getReductionVars().empty();
5680   // Do not interleave loops with a relatively small known or estimated trip
5681   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5685   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5686       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5687     return 1;
5688 
5689   // If we did not calculate the cost for VF (because the user selected the VF)
5690   // then we calculate the cost of VF here.
5691   if (LoopCost == 0) {
5692     InstructionCost C = expectedCost(VF).first;
5693     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5694     LoopCost = *C.getValue();
5695 
5696     // Loop body is free and there is no need for interleaving.
5697     if (LoopCost == 0)
5698       return 1;
5699   }
5700 
5701   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants below, so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }
5707 
5708   // We calculate the interleave count using the following formula.
5709   // Subtract the number of loop invariants from the number of available
5710   // registers. These registers are used by all of the interleaved instances.
5711   // Next, divide the remaining registers by the number of registers that is
5712   // required by the loop, in order to estimate how many parallel instances
5713   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations and alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case IC
  // is set to 1 above.
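  // For example (illustrative numbers): with 32 registers in a class, 2 of
  // them holding loop-invariant values and at most 5 values live at once,
  // roughly (32 - 2) / 5 = 6 instances fit, which is rounded down to an
  // interleave count of 4 for that class.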
5719   unsigned IC = UINT_MAX;
5720 
  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5726     if (VF.isScalar()) {
5727       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5728         TargetNumRegisters = ForceTargetNumScalarRegs;
5729     } else {
5730       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5731         TargetNumRegisters = ForceTargetNumVectorRegs;
5732     }
5733     unsigned MaxLocalUsers = pair.second;
5734     unsigned LoopInvariantRegs = 0;
5735     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5736       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5737 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5739     // Don't count the induction variable as interleaved.
5740     if (EnableIndVarRegisterHeur) {
5741       TmpIC =
5742           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5743                         std::max(1U, (MaxLocalUsers - 1)));
5744     }
5745 
5746     IC = std::min(IC, TmpIC);
5747   }
5748 
5749   // Clamp the interleave ranges to reasonable counts.
5750   unsigned MaxInterleaveCount =
5751       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5752 
5753   // Check if the user has overridden the max.
5754   if (VF.isScalar()) {
5755     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5756       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5757   } else {
5758     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5759       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5760   }
5761 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count so it does not exceed the trip count divided by VF,
  // provided the result is at least 1.
5765   //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iteration are enabled. However, for larger loops, there is likely to be a
5769   // similar benefit as for fixed-width vectors. For now, we choose to leave
5770   // the InterleaveCount as if vscale is '1', although if some information about
5771   // the vector is known (e.g. min vector size), we can make a better decision.
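  // For example (illustrative numbers): with an estimated trip count of 24
  // and VF = 8, the interleave count is limited to at most 3.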
5772   if (BestKnownTC) {
5773     MaxInterleaveCount =
5774         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5775     // Make sure MaxInterleaveCount is greater than 0.
5776     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5777   }
5778 
5779   assert(MaxInterleaveCount > 0 &&
5780          "Maximum interleave count must be greater than 0");
5781 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5784   if (IC > MaxInterleaveCount)
5785     IC = MaxInterleaveCount;
5786   else
5787     // Make sure IC is greater than 0.
5788     IC = std::max(1u, IC);
5789 
5790   assert(IC > 0 && "Interleave count must be greater than 0.");
5791 
5792   // Interleave if we vectorized this loop and there is a reduction that could
5793   // benefit from interleaving.
5794   if (VF.isVector() && HasReductions) {
5795     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5796     return IC;
5797   }
5798 
  // For any scalar loop that either requires runtime checks or predication, we
5800   // are better off leaving this to the unroller. Note that if we've already
5801   // vectorized the loop we will have done the runtime check and so interleaving
5802   // won't require further checks.
5803   bool ScalarInterleavingRequiresPredication =
5804       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5805          return Legal->blockNeedsPredication(BB);
5806        }));
5807   bool ScalarInterleavingRequiresRuntimePointerCheck =
5808       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5809 
5810   // We want to interleave small loops in order to reduce the loop overhead and
5811   // potentially expose ILP opportunities.
5812   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5813                     << "LV: IC is " << IC << '\n'
5814                     << "LV: VF is " << VF << '\n');
5815   const bool AggressivelyInterleaveReductions =
5816       TTI.enableAggressiveInterleaving(HasReductions);
5817   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5818       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5819     // We assume that the cost overhead is 1 and we use the cost model
5820     // to estimate the cost of the loop and interleave until the cost of the
5821     // loop overhead is about 5% of the cost of the loop.
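    // For example (illustrative numbers): with SmallLoopCost = 20 and
    // LoopCost = 3, SmallIC is capped at PowerOf2Floor(20 / 3) = 4.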
5822     unsigned SmallIC =
5823         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5824 
5825     // Interleave until store/load ports (estimated by max interleave count) are
5826     // saturated.
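    // For example (illustrative numbers): with IC = 8, two stores and four
    // loads, StoresIC is 4 and LoadsIC is 2.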
5827     unsigned NumStores = Legal->getNumStores();
5828     unsigned NumLoads = Legal->getNumLoads();
5829     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5830     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5831 
5832     // There is little point in interleaving for reductions containing selects
5833     // and compares when VF=1 since it may just create more overhead than it's
5834     // worth for loops with small trip counts. This is because we still have to
5835     // do the final reduction after the loop.
5836     bool HasSelectCmpReductions =
5837         HasReductions &&
5838         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5839           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5840           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5841               RdxDesc.getRecurrenceKind());
5842         });
5843     if (HasSelectCmpReductions) {
5844       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5845       return 1;
5846     }
5847 
5848     // If we have a scalar reduction (vector reductions are already dealt with
5849     // by this point), we can increase the critical path length if the loop
5850     // we're interleaving is inside another loop. For tree-wise reductions
5851     // set the limit to 2, and for ordered reductions it's best to disable
5852     // interleaving entirely.
5853     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5854       bool HasOrderedReductions =
5855           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5856             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5857             return RdxDesc.isOrdered();
5858           });
5859       if (HasOrderedReductions) {
5860         LLVM_DEBUG(
5861             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5862         return 1;
5863       }
5864 
5865       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5866       SmallIC = std::min(SmallIC, F);
5867       StoresIC = std::min(StoresIC, F);
5868       LoadsIC = std::min(LoadsIC, F);
5869     }
5870 
5871     if (EnableLoadStoreRuntimeInterleave &&
5872         std::max(StoresIC, LoadsIC) > SmallIC) {
5873       LLVM_DEBUG(
5874           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5875       return std::max(StoresIC, LoadsIC);
5876     }
5877 
5878     // If there are scalar reductions and TTI has enabled aggressive
5879     // interleaving for reductions, we will interleave to expose ILP.
5880     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5881         AggressivelyInterleaveReductions) {
5882       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5883       // Interleave no less than SmallIC but not as aggressive as the normal IC
5884       // to satisfy the rare situation when resources are too limited.
5885       return std::max(IC / 2, SmallIC);
5886     } else {
5887       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5888       return SmallIC;
5889     }
5890   }
5891 
5892   // Interleave if this is a large loop (small loops are already dealt with by
5893   // this point) that could benefit from interleaving.
5894   if (AggressivelyInterleaveReductions) {
5895     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5896     return IC;
5897   }
5898 
5899   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5900   return 1;
5901 }
5902 
5903 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5904 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5905   // This function calculates the register usage by measuring the highest number
5906   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5908   // assign a number to each instruction. We use RPO to ensure that defs are
5909   // met before their users. We assume that each instruction that has in-loop
5910   // users starts an interval. We record every time that an in-loop value is
5911   // used, so we have a list of the first and last occurrences of each
5912   // instruction. Next, we transpose this data structure into a multi map that
5913   // holds the list of intervals that *end* at a specific location. This multi
5914   // map allows us to perform a linear search. We scan the instructions linearly
5915   // and record each time that a new interval starts, by placing it in a set.
5916   // If we find this value in the multi-map then we remove it from the set.
5917   // The max register usage is the maximum size of the set.
5918   // We also search for instructions that are defined outside the loop, but are
5919   // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // more registers.
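  // For example (illustrative): for a straight-line sequence
  //   %a = ...; %b = f(%a); %c = g(%a, %b)
  // the intervals for both %a and %b end at %c, so at most two values
  // (%a and %b) are counted as live at the same time.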
5922   LoopBlocksDFS DFS(TheLoop);
5923   DFS.perform(LI);
5924 
5925   RegisterUsage RU;
5926 
5927   // Each 'key' in the map opens a new interval. The values
5928   // of the map are the index of the 'last seen' usage of the
5929   // instruction that is the key.
5930   using IntervalMap = DenseMap<Instruction *, unsigned>;
5931 
5932   // Maps instruction to its index.
5933   SmallVector<Instruction *, 64> IdxToInstr;
5934   // Marks the end of each interval.
5935   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
5937   SmallPtrSet<Instruction *, 8> Ends;
5938   // Saves the list of values that are used in the loop but are
5939   // defined outside the loop, such as arguments and constants.
5940   SmallPtrSet<Value *, 8> LoopInvariants;
5941 
5942   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5943     for (Instruction &I : BB->instructionsWithoutDebug()) {
5944       IdxToInstr.push_back(&I);
5945 
5946       // Save the end location of each USE.
5947       for (Value *U : I.operands()) {
5948         auto *Instr = dyn_cast<Instruction>(U);
5949 
5950         // Ignore non-instruction values such as arguments, constants, etc.
5951         if (!Instr)
5952           continue;
5953 
5954         // If this instruction is outside the loop then record it and continue.
5955         if (!TheLoop->contains(Instr)) {
5956           LoopInvariants.insert(Instr);
5957           continue;
5958         }
5959 
5960         // Overwrite previous end points.
5961         EndPoint[Instr] = IdxToInstr.size();
5962         Ends.insert(Instr);
5963       }
5964     }
5965   }
5966 
5967   // Saves the list of intervals that end with the index in 'key'.
5968   using InstrList = SmallVector<Instruction *, 2>;
5969   DenseMap<unsigned, InstrList> TransposeEnds;
5970 
5971   // Transpose the EndPoints to a list of values that end at each index.
5972   for (auto &Interval : EndPoint)
5973     TransposeEnds[Interval.second].push_back(Interval.first);
5974 
5975   SmallPtrSet<Instruction *, 8> OpenIntervals;
5976   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5977   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5978 
5979   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5980 
5981   auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
5982     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5983       return 0;
5984     return TTI.getRegUsageForType(VectorType::get(Ty, VF));
5985   };
5986 
5987   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5988     Instruction *I = IdxToInstr[i];
5989 
5990     // Remove all of the instructions that end at this location.
5991     InstrList &List = TransposeEnds[i];
5992     for (Instruction *ToRemove : List)
5993       OpenIntervals.erase(ToRemove);
5994 
5995     // Ignore instructions that are never used within the loop.
5996     if (!Ends.count(I))
5997       continue;
5998 
5999     // Skip ignored values.
6000     if (ValuesToIgnore.count(I))
6001       continue;
6002 
6003     // For each VF find the maximum usage of registers.
6004     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6005       // Count the number of live intervals.
6006       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6007 
6008       if (VFs[j].isScalar()) {
6009         for (auto Inst : OpenIntervals) {
6010           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
6015         }
6016       } else {
6017         collectUniformsAndScalars(VFs[j]);
6018         for (auto Inst : OpenIntervals) {
6019           // Skip ignored values for VF > 1.
6020           if (VecValuesToIgnore.count(Inst))
6021             continue;
6022           if (isScalarAfterVectorization(Inst, VFs[j])) {
6023             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
6028           } else {
6029             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6034           }
6035         }
6036       }
6037 
      for (const auto &Pair : RegUsage) {
        auto &Entry = MaxUsages[j][Pair.first];
        Entry = std::max(Entry, Pair.second);
      }
6044     }
6045 
6046     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6047                       << OpenIntervals.size() << '\n');
6048 
6049     // Add the current instruction to the list of open intervals.
6050     OpenIntervals.insert(I);
6051   }
6052 
6053   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6054     SmallMapVector<unsigned, unsigned, 4> Invariant;
6055 
6056     for (auto Inst : LoopInvariants) {
6057       unsigned Usage =
6058           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6059       unsigned ClassID =
6060           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6065     }
6066 
6067     LLVM_DEBUG({
6068       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6069       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6070              << " item\n";
6071       for (const auto &pair : MaxUsages[i]) {
6072         dbgs() << "LV(REG): RegisterClass: "
6073                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6074                << " registers\n";
6075       }
6076       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6077              << " item\n";
6078       for (const auto &pair : Invariant) {
6079         dbgs() << "LV(REG): RegisterClass: "
6080                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6081                << " registers\n";
6082       }
6083     });
6084 
6085     RU.LoopInvariantRegs = Invariant;
6086     RU.MaxLocalUsers = MaxUsages[i];
6087     RUs[i] = RU;
6088   }
6089 
6090   return RUs;
6091 }
6092 
6093 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6094                                                            ElementCount VF) {
6095   // TODO: Cost model for emulated masked load/store is completely
6096   // broken. This hack guides the cost model to use an artificially
6097   // high enough value to practically disable vectorization with such
6098   // operations, except where previously deployed legality hack allowed
6099   // using very low cost values. This is to avoid regressions coming simply
6100   // from moving "masked load/store" check from legality to cost model.
  // Emulation of masked loads/gathers was previously never allowed; only a
  // limited number of emulated masked stores/scatters was allowed.
6103   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6104   return isa<LoadInst>(I) ||
6105          (isa<StoreInst>(I) &&
6106           NumPredStores > NumberOfStoresToPredicate);
6107 }
6108 
6109 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6110   // If we aren't vectorizing the loop, or if we've already collected the
6111   // instructions to scalarize, there's nothing to do. Collection may already
6112   // have occurred if we have a user-selected VF and are now computing the
6113   // expected cost for interleaving.
6114   if (VF.isScalar() || VF.isZero() ||
6115       InstsToScalarize.find(VF) != InstsToScalarize.end())
6116     return;
6117 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6119   // not profitable to scalarize any instructions, the presence of VF in the
6120   // map will indicate that we've analyzed it already.
6121   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6122 
6123   // Find all the instructions that are scalar with predication in the loop and
6124   // determine if it would be better to not if-convert the blocks they are in.
6125   // If so, we also record the instructions to scalarize.
6126   for (BasicBlock *BB : TheLoop->blocks()) {
6127     if (!blockNeedsPredicationForAnyReason(BB))
6128       continue;
6129     for (Instruction &I : *BB)
6130       if (isScalarWithPredication(&I, VF)) {
6131         ScalarCostsTy ScalarCosts;
6132         // Do not apply discount if scalable, because that would lead to
6133         // invalid scalarization costs.
6134         // Do not apply discount logic if hacked cost is needed
6135         // for emulated masked memrefs.
6136         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6137             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6138           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6139         // Remember that BB will remain after vectorization.
6140         PredicatedBBsAfterVectorization.insert(BB);
6141       }
6142   }
6143 }
6144 
6145 int LoopVectorizationCostModel::computePredInstDiscount(
6146     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6147   assert(!isUniformAfterVectorization(PredInst, VF) &&
6148          "Instruction marked uniform-after-vectorization will be predicated");
6149 
6150   // Initialize the discount to zero, meaning that the scalar version and the
6151   // vector version cost the same.
6152   InstructionCost Discount = 0;
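  // Illustrative example with made-up costs: if the vector form of the chain
  // feeding PredInst costs 12 and its scalarized form, after scaling by the
  // block probability, costs 8, the accumulated discount is +4, signalling to
  // the caller that scalarizing the chain is profitable.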
6153 
6154   // Holds instructions to analyze. The instructions we visit are mapped in
6155   // ScalarCosts. Those instructions are the ones that would be scalarized if
6156   // we find that the scalar version costs less.
6157   SmallVector<Instruction *, 8> Worklist;
6158 
6159   // Returns true if the given instruction can be scalarized.
6160   auto canBeScalarized = [&](Instruction *I) -> bool {
6161     // We only attempt to scalarize instructions forming a single-use chain
6162     // from the original predicated block that would otherwise be vectorized.
6163     // Although not strictly necessary, we give up on instructions we know will
6164     // already be scalar to avoid traversing chains that are unlikely to be
6165     // beneficial.
6166     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6167         isScalarAfterVectorization(I, VF))
6168       return false;
6169 
6170     // If the instruction is scalar with predication, it will be analyzed
6171     // separately. We ignore it within the context of PredInst.
6172     if (isScalarWithPredication(I, VF))
6173       return false;
6174 
6175     // If any of the instruction's operands are uniform after vectorization,
6176     // the instruction cannot be scalarized. This prevents, for example, a
6177     // masked load from being scalarized.
6178     //
6179     // We assume we will only emit a value for lane zero of an instruction
6180     // marked uniform after vectorization, rather than VF identical values.
6181     // Thus, if we scalarize an instruction that uses a uniform, we would
6182     // create uses of values corresponding to the lanes we aren't emitting code
6183     // for. This behavior can be changed by allowing getScalarValue to clone
6184     // the lane zero values for uniforms rather than asserting.
6185     for (Use &U : I->operands())
6186       if (auto *J = dyn_cast<Instruction>(U.get()))
6187         if (isUniformAfterVectorization(J, VF))
6188           return false;
6189 
6190     // Otherwise, we can scalarize the instruction.
6191     return true;
6192   };
6193 
6194   // Compute the expected cost discount from scalarizing the entire expression
6195   // feeding the predicated instruction. We currently only consider expressions
6196   // that are single-use instruction chains.
6197   Worklist.push_back(PredInst);
6198   while (!Worklist.empty()) {
6199     Instruction *I = Worklist.pop_back_val();
6200 
6201     // If we've already analyzed the instruction, there's nothing to do.
6202     if (ScalarCosts.find(I) != ScalarCosts.end())
6203       continue;
6204 
6205     // Compute the cost of the vector instruction. Note that this cost already
6206     // includes the scalarization overhead of the predicated instruction.
6207     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6208 
6209     // Compute the cost of the scalarized instruction. This cost is the cost of
6210     // the instruction as if it wasn't if-converted and instead remained in the
6211     // predicated block. We will scale this cost by block probability after
6212     // computing the scalarization overhead.
6213     InstructionCost ScalarCost =
6214         VF.getFixedValue() *
6215         getInstructionCost(I, ElementCount::getFixed(1)).first;
6216 
6217     // Compute the scalarization overhead of needed insertelement instructions
6218     // and phi nodes.
6219     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6220       ScalarCost += TTI.getScalarizationOverhead(
6221           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6222           APInt::getAllOnes(VF.getFixedValue()), true, false);
6223       ScalarCost +=
6224           VF.getFixedValue() *
6225           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6226     }
6227 
6228     // Compute the scalarization overhead of needed extractelement
6229     // instructions. For each of the instruction's operands, if the operand can
6230     // be scalarized, add it to the worklist; otherwise, account for the
6231     // overhead.
6232     for (Use &U : I->operands())
6233       if (auto *J = dyn_cast<Instruction>(U.get())) {
6234         assert(VectorType::isValidElementType(J->getType()) &&
6235                "Instruction has non-scalar type");
6236         if (canBeScalarized(J))
6237           Worklist.push_back(J);
6238         else if (needsExtract(J, VF)) {
6239           ScalarCost += TTI.getScalarizationOverhead(
6240               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6241               APInt::getAllOnes(VF.getFixedValue()), false, true);
6242         }
6243       }
6244 
6245     // Scale the total scalar cost by block probability.
6246     ScalarCost /= getReciprocalPredBlockProb();
6247 
6248     // Compute the discount. A non-negative discount means the vector version
6249     // of the instruction costs more, and scalarizing would be beneficial.
6250     Discount += VectorCost - ScalarCost;
6251     ScalarCosts[I] = ScalarCost;
6252   }
6253 
6254   return *Discount.getValue();
6255 }
6256 
6257 LoopVectorizationCostModel::VectorizationCostTy
6258 LoopVectorizationCostModel::expectedCost(
6259     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6260   VectorizationCostTy Cost;
6261 
6262   // For each block.
6263   for (BasicBlock *BB : TheLoop->blocks()) {
6264     VectorizationCostTy BlockCost;
6265 
6266     // For each instruction in the old loop.
6267     for (Instruction &I : BB->instructionsWithoutDebug()) {
6268       // Skip ignored values.
6269       if (ValuesToIgnore.count(&I) ||
6270           (VF.isVector() && VecValuesToIgnore.count(&I)))
6271         continue;
6272 
6273       VectorizationCostTy C = getInstructionCost(&I, VF);
6274 
6275       // Check if we should override the cost.
6276       if (C.first.isValid() &&
6277           ForceTargetInstructionCost.getNumOccurrences() > 0)
6278         C.first = InstructionCost(ForceTargetInstructionCost);
6279 
6280       // Keep a list of instructions with invalid costs.
6281       if (Invalid && !C.first.isValid())
6282         Invalid->emplace_back(&I, VF);
6283 
6284       BlockCost.first += C.first;
6285       BlockCost.second |= C.second;
6286       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6287                         << " for VF " << VF << " For instruction: " << I
6288                         << '\n');
6289     }
6290 
6291     // If we are vectorizing a predicated block, it will have been
6292     // if-converted. This means that the block's instructions (aside from
6293     // stores and instructions that may divide by zero) will now be
6294     // unconditionally executed. For the scalar case, we may not always execute
6295     // the predicated block, if it is an if-else block. Thus, scale the block's
6296     // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as not to include all blocks in tail-folded loops.
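    // Illustrative example with made-up costs: assuming the usual reciprocal
    // block probability of 2, a predicated block whose scalar instructions
    // sum to a cost of 8 contributes 4 to the expected scalar loop cost.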
6298     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6299       BlockCost.first /= getReciprocalPredBlockProb();
6300 
6301     Cost.first += BlockCost.first;
6302     Cost.second |= BlockCost.second;
6303   }
6304 
6305   return Cost;
6306 }
6307 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6310 ///
6311 /// This SCEV can be sent to the Target in order to estimate the address
6312 /// calculation cost.
6313 static const SCEV *getAddressAccessSCEV(
6314               Value *Ptr,
6315               LoopVectorizationLegality *Legal,
6316               PredicatedScalarEvolution &PSE,
6317               const Loop *TheLoop) {
6318 
6319   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6320   if (!Gep)
6321     return nullptr;
6322 
6323   // We are looking for a gep with all loop invariant indices except for one
6324   // which should be an induction variable.
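  // An illustrative GEP of this shape (names are made up):
  //   %gep = getelementptr inbounds i32, i32* %invariant.base, i64 %iv
  // where %invariant.base is loop invariant and %iv is an induction variable.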
6325   auto SE = PSE.getSE();
6326   unsigned NumOperands = Gep->getNumOperands();
6327   for (unsigned i = 1; i < NumOperands; ++i) {
6328     Value *Opd = Gep->getOperand(i);
6329     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6330         !Legal->isInductionVariable(Opd))
6331       return nullptr;
6332   }
6333 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6335   return PSE.getSCEV(Ptr);
6336 }
6337 
6338 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6339   return Legal->hasStride(I->getOperand(0)) ||
6340          Legal->hasStride(I->getOperand(1));
6341 }
6342 
6343 InstructionCost
6344 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6345                                                         ElementCount VF) {
6346   assert(VF.isVector() &&
6347          "Scalarization cost of instruction implies vectorization.");
6348   if (VF.isScalable())
6349     return InstructionCost::getInvalid();
6350 
6351   Type *ValTy = getLoadStoreType(I);
6352   auto SE = PSE.getSE();
6353 
6354   unsigned AS = getLoadStoreAddressSpace(I);
6355   Value *Ptr = getLoadStorePointerOperand(I);
6356   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6357   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6358   //       that it is being called from this specific place.
6359 
6360   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6362   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6363 
6364   // Get the cost of the scalar memory instruction and address computation.
6365   InstructionCost Cost =
6366       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6367 
6368   // Don't pass *I here, since it is scalar but will actually be part of a
6369   // vectorized loop where the user of it is a vectorized instruction.
6370   const Align Alignment = getLoadStoreAlignment(I);
6371   Cost += VF.getKnownMinValue() *
6372           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6373                               AS, TTI::TCK_RecipThroughput);
6374 
6375   // Get the overhead of the extractelement and insertelement instructions
6376   // we might create due to scalarization.
6377   Cost += getScalarizationOverhead(I, VF);
6378 
6379   // If we have a predicated load/store, it will need extra i1 extracts and
6380   // conditional branches, but may not be executed for each vector lane. Scale
6381   // the cost by the probability of executing the predicated block.
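  // Illustrative example with made-up costs: with a reciprocal block
  // probability of 2, a VF=4 scalarized load whose accumulated per-lane
  // address, memory-op and extract/insert costs come to 20 is first halved to
  // 10, and the i1 extract and branch costs below are then added on top.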
6382   if (isPredicatedInst(I, VF)) {
6383     Cost /= getReciprocalPredBlockProb();
6384 
6385     // Add the cost of an i1 extract and a branch
6386     auto *Vec_i1Ty =
6387         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6388     Cost += TTI.getScalarizationOverhead(
6389         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6390         /*Insert=*/false, /*Extract=*/true);
6391     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6392 
6393     if (useEmulatedMaskMemRefHack(I, VF))
6394       // Artificially setting to a high enough value to practically disable
6395       // vectorization with such operations.
6396       Cost = 3000000;
6397   }
6398 
6399   return Cost;
6400 }
6401 
6402 InstructionCost
6403 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6404                                                     ElementCount VF) {
6405   Type *ValTy = getLoadStoreType(I);
6406   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6407   Value *Ptr = getLoadStorePointerOperand(I);
6408   unsigned AS = getLoadStoreAddressSpace(I);
6409   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6410   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6411 
6412   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6413          "Stride should be 1 or -1 for consecutive memory access");
6414   const Align Alignment = getLoadStoreAlignment(I);
6415   InstructionCost Cost = 0;
6416   if (Legal->isMaskRequired(I))
6417     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6418                                       CostKind);
6419   else
6420     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6421                                 CostKind, I);
6422 
6423   bool Reverse = ConsecutiveStride < 0;
6424   if (Reverse)
6425     Cost +=
6426         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6427   return Cost;
6428 }
6429 
6430 InstructionCost
6431 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6432                                                 ElementCount VF) {
6433   assert(Legal->isUniformMemOp(*I));
6434 
6435   Type *ValTy = getLoadStoreType(I);
6436   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6437   const Align Alignment = getLoadStoreAlignment(I);
6438   unsigned AS = getLoadStoreAddressSpace(I);
6439   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6440   if (isa<LoadInst>(I)) {
6441     return TTI.getAddressComputationCost(ValTy) +
6442            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6443                                CostKind) +
6444            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6445   }
6446   StoreInst *SI = cast<StoreInst>(I);
6447 
6448   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6449   return TTI.getAddressComputationCost(ValTy) +
6450          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6451                              CostKind) +
6452          (isLoopInvariantStoreValue
6453               ? 0
6454               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6455                                        VF.getKnownMinValue() - 1));
6456 }
6457 
6458 InstructionCost
6459 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6460                                                  ElementCount VF) {
6461   Type *ValTy = getLoadStoreType(I);
6462   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6463   const Align Alignment = getLoadStoreAlignment(I);
6464   const Value *Ptr = getLoadStorePointerOperand(I);
6465 
6466   return TTI.getAddressComputationCost(VectorTy) +
6467          TTI.getGatherScatterOpCost(
6468              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6469              TargetTransformInfo::TCK_RecipThroughput, I);
6470 }
6471 
6472 InstructionCost
6473 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6474                                                    ElementCount VF) {
6475   // TODO: Once we have support for interleaving with scalable vectors
6476   // we can calculate the cost properly here.
6477   if (VF.isScalable())
6478     return InstructionCost::getInvalid();
6479 
6480   Type *ValTy = getLoadStoreType(I);
6481   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6482   unsigned AS = getLoadStoreAddressSpace(I);
6483 
6484   auto Group = getInterleavedAccessGroup(I);
6485   assert(Group && "Fail to get an interleaved access group.");
6486 
6487   unsigned InterleaveFactor = Group->getFactor();
6488   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6489 
6490   // Holds the indices of existing members in the interleaved group.
6491   SmallVector<unsigned, 4> Indices;
6492   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6493     if (Group->getMember(IF))
6494       Indices.push_back(IF);
6495 
6496   // Calculate the cost of the whole interleaved group.
6497   bool UseMaskForGaps =
6498       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6499       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6500   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6501       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6502       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6503 
6504   if (Group->isReverse()) {
6505     // TODO: Add support for reversed masked interleaved access.
6506     assert(!Legal->isMaskRequired(I) &&
6507            "Reverse masked interleaved access not supported.");
6508     Cost +=
6509         Group->getNumMembers() *
6510         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6511   }
6512   return Cost;
6513 }
6514 
6515 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6516     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6517   using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
6519   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6520     return None;
6521   auto *VectorTy = cast<VectorType>(Ty);
6522 
  // We are looking for one of the following patterns, finding the minimal
  // acceptable cost for it:
6524   //  reduce(mul(ext(A), ext(B))) or
6525   //  reduce(mul(A, B)) or
6526   //  reduce(ext(A)) or
6527   //  reduce(A).
6528   // The basic idea is that we walk down the tree to do that, finding the root
6529   // reduction instruction in InLoopReductionImmediateChains. From there we find
6530   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6531   // of the components. If the reduction cost is lower then we return it for the
6532   // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying the original cost method
6534   // should be used.
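  //
  // For instance, the first pattern could appear in IR roughly as follows
  // (made-up names, shown only as an illustration):
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul = mul nsw i32 %a.ext, %b.ext
  //   %red = add i32 %red.phi, %mul   ; the in-loop add reduction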
6535   Instruction *RetI = I;
6536   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6537     if (!RetI->hasOneUser())
6538       return None;
6539     RetI = RetI->user_back();
6540   }
6541   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6542       RetI->user_back()->getOpcode() == Instruction::Add) {
6543     if (!RetI->hasOneUser())
6544       return None;
6545     RetI = RetI->user_back();
6546   }
6547 
6548   // Test if the found instruction is a reduction, and if not return an invalid
6549   // cost specifying the parent to use the original cost modelling.
6550   if (!InLoopReductionImmediateChains.count(RetI))
6551     return None;
6552 
6553   // Find the reduction this chain is a part of and calculate the basic cost of
6554   // the reduction on its own.
6555   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6556   Instruction *ReductionPhi = LastChain;
6557   while (!isa<PHINode>(ReductionPhi))
6558     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6559 
6560   const RecurrenceDescriptor &RdxDesc =
6561       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6562 
6563   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6564       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6565 
6566   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6567   // normal fmul instruction to the cost of the fadd reduction.
6568   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6569     BaseCost +=
6570         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6571 
6572   // If we're using ordered reductions then we can just return the base cost
6573   // here, since getArithmeticReductionCost calculates the full ordered
6574   // reduction cost when FP reassociation is not allowed.
6575   if (useOrderedReductions(RdxDesc))
6576     return BaseCost;
6577 
6578   // Get the operand that was not the reduction chain and match it to one of the
6579   // patterns, returning the better cost if it is found.
6580   Instruction *RedOp = RetI->getOperand(1) == LastChain
6581                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6582                            : dyn_cast<Instruction>(RetI->getOperand(1));
6583 
6584   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6585 
6586   Instruction *Op0, *Op1;
6587   if (RedOp &&
6588       match(RedOp,
6589             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6590       match(Op0, m_ZExtOrSExt(m_Value())) &&
6591       Op0->getOpcode() == Op1->getOpcode() &&
6592       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6593       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6594       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6595 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6597     // Note that the extend opcodes need to all match, or if A==B they will have
6598     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6599     // which is equally fine.
6600     bool IsUnsigned = isa<ZExtInst>(Op0);
6601     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6602     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6603 
6604     InstructionCost ExtCost =
6605         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6606                              TTI::CastContextHint::None, CostKind, Op0);
6607     InstructionCost MulCost =
6608         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6609     InstructionCost Ext2Cost =
6610         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6611                              TTI::CastContextHint::None, CostKind, RedOp);
6612 
6613     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6614         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6615         CostKind);
6616 
6617     if (RedCost.isValid() &&
6618         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6619       return I == RetI ? RedCost : 0;
6620   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6621              !TheLoop->isLoopInvariant(RedOp)) {
6622     // Matched reduce(ext(A))
6623     bool IsUnsigned = isa<ZExtInst>(RedOp);
6624     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6625     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6626         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6627         CostKind);
6628 
6629     InstructionCost ExtCost =
6630         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6631                              TTI::CastContextHint::None, CostKind, RedOp);
6632     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6633       return I == RetI ? RedCost : 0;
6634   } else if (RedOp &&
6635              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6636     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6637         Op0->getOpcode() == Op1->getOpcode() &&
6638         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6639       bool IsUnsigned = isa<ZExtInst>(Op0);
6640       Type *Op0Ty = Op0->getOperand(0)->getType();
6641       Type *Op1Ty = Op1->getOperand(0)->getType();
6642       Type *LargestOpTy =
6643           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6644                                                                     : Op0Ty;
6645       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6646 
6647       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6648       // different sizes. We take the largest type as the ext to reduce, and add
6649       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6650       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6651           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6652           TTI::CastContextHint::None, CostKind, Op0);
6653       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6654           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6655           TTI::CastContextHint::None, CostKind, Op1);
6656       InstructionCost MulCost =
6657           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6658 
6659       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6660           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6661           CostKind);
6662       InstructionCost ExtraExtCost = 0;
6663       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6664         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6665         ExtraExtCost = TTI.getCastInstrCost(
6666             ExtraExtOp->getOpcode(), ExtType,
6667             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6668             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6669       }
6670 
6671       if (RedCost.isValid() &&
6672           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6673         return I == RetI ? RedCost : 0;
6674     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6675       // Matched reduce(mul())
6676       InstructionCost MulCost =
6677           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6678 
6679       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6680           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6681           CostKind);
6682 
6683       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6684         return I == RetI ? RedCost : 0;
6685     }
6686   }
6687 
6688   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6689 }
6690 
6691 InstructionCost
6692 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6693                                                      ElementCount VF) {
6694   // Calculate scalar cost only. Vectorization cost should be ready at this
6695   // moment.
6696   if (VF.isScalar()) {
6697     Type *ValTy = getLoadStoreType(I);
6698     const Align Alignment = getLoadStoreAlignment(I);
6699     unsigned AS = getLoadStoreAddressSpace(I);
6700 
6701     return TTI.getAddressComputationCost(ValTy) +
6702            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6703                                TTI::TCK_RecipThroughput, I);
6704   }
6705   return getWideningCost(I, VF);
6706 }
6707 
6708 LoopVectorizationCostModel::VectorizationCostTy
6709 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6710                                                ElementCount VF) {
6711   // If we know that this instruction will remain uniform, check the cost of
6712   // the scalar version.
6713   if (isUniformAfterVectorization(I, VF))
6714     VF = ElementCount::getFixed(1);
6715 
6716   if (VF.isVector() && isProfitableToScalarize(I, VF))
6717     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6718 
6719   // Forced scalars do not have any scalarization overhead.
6720   auto ForcedScalar = ForcedScalars.find(VF);
6721   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6722     auto InstSet = ForcedScalar->second;
6723     if (InstSet.count(I))
6724       return VectorizationCostTy(
6725           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6726            VF.getKnownMinValue()),
6727           false);
6728   }
6729 
6730   Type *VectorTy;
6731   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6732 
6733   bool TypeNotScalarized = false;
6734   if (VF.isVector() && VectorTy->isVectorTy()) {
6735     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
6736     if (NumParts)
6737       TypeNotScalarized = NumParts < VF.getKnownMinValue();
6738     else
6739       C = InstructionCost::getInvalid();
6740   }
6741   return VectorizationCostTy(C, TypeNotScalarized);
6742 }
6743 
6744 InstructionCost
6745 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6746                                                      ElementCount VF) const {
6747 
6748   // There is no mechanism yet to create a scalable scalarization loop,
6749   // so this is currently Invalid.
6750   if (VF.isScalable())
6751     return InstructionCost::getInvalid();
6752 
6753   if (VF.isScalar())
6754     return 0;
6755 
6756   InstructionCost Cost = 0;
6757   Type *RetTy = ToVectorTy(I->getType(), VF);
6758   if (!RetTy->isVoidTy() &&
6759       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6760     Cost += TTI.getScalarizationOverhead(
6761         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6762         false);
6763 
6764   // Some targets keep addresses scalar.
6765   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6766     return Cost;
6767 
6768   // Some targets support efficient element stores.
6769   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6770     return Cost;
6771 
6772   // Collect operands to consider.
6773   CallInst *CI = dyn_cast<CallInst>(I);
6774   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6775 
6776   // Skip operands that do not require extraction/scalarization and do not incur
6777   // any overhead.
6778   SmallVector<Type *> Tys;
6779   for (auto *V : filterExtractingOperands(Ops, VF))
6780     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6781   return Cost + TTI.getOperandsScalarizationOverhead(
6782                     filterExtractingOperands(Ops, VF), Tys);
6783 }
6784 
6785 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6786   if (VF.isScalar())
6787     return;
6788   NumPredStores = 0;
6789   for (BasicBlock *BB : TheLoop->blocks()) {
6790     // For each instruction in the old loop.
6791     for (Instruction &I : *BB) {
6792       Value *Ptr =  getLoadStorePointerOperand(&I);
6793       if (!Ptr)
6794         continue;
6795 
6796       // TODO: We should generate better code and update the cost model for
6797       // predicated uniform stores. Today they are treated as any other
6798       // predicated store (see added test cases in
6799       // invariant-store-vectorization.ll).
6800       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6801         NumPredStores++;
6802 
6803       if (Legal->isUniformMemOp(I)) {
6804         // TODO: Avoid replicating loads and stores instead of
6805         // relying on instcombine to remove them.
6806         // Load: Scalar load + broadcast
6807         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6808         InstructionCost Cost;
6809         if (isa<StoreInst>(&I) && VF.isScalable() &&
6810             isLegalGatherOrScatter(&I, VF)) {
6811           Cost = getGatherScatterCost(&I, VF);
6812           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6813         } else {
6814           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
6815                  "Cannot yet scalarize uniform stores");
6816           Cost = getUniformMemOpCost(&I, VF);
6817           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6818         }
6819         continue;
6820       }
6821 
6822       // We assume that widening is the best solution when possible.
6823       if (memoryInstructionCanBeWidened(&I, VF)) {
6824         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6825         int ConsecutiveStride = Legal->isConsecutivePtr(
6826             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6827         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6828                "Expected consecutive stride.");
6829         InstWidening Decision =
6830             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6831         setWideningDecision(&I, VF, Decision, Cost);
6832         continue;
6833       }
6834 
6835       // Choose between Interleaving, Gather/Scatter or Scalarization.
6836       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6837       unsigned NumAccesses = 1;
6838       if (isAccessInterleaved(&I)) {
6839         auto Group = getInterleavedAccessGroup(&I);
6840         assert(Group && "Fail to get an interleaved access group.");
6841 
6842         // Make one decision for the whole group.
6843         if (getWideningDecision(&I, VF) != CM_Unknown)
6844           continue;
6845 
6846         NumAccesses = Group->getNumMembers();
6847         if (interleavedAccessCanBeWidened(&I, VF))
6848           InterleaveCost = getInterleaveGroupCost(&I, VF);
6849       }
6850 
6851       InstructionCost GatherScatterCost =
6852           isLegalGatherOrScatter(&I, VF)
6853               ? getGatherScatterCost(&I, VF) * NumAccesses
6854               : InstructionCost::getInvalid();
6855 
6856       InstructionCost ScalarizationCost =
6857           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6858 
6859       // Choose better solution for the current VF,
6860       // write down this decision and use it during vectorization.
6861       InstructionCost Cost;
6862       InstWidening Decision;
6863       if (InterleaveCost <= GatherScatterCost &&
6864           InterleaveCost < ScalarizationCost) {
6865         Decision = CM_Interleave;
6866         Cost = InterleaveCost;
6867       } else if (GatherScatterCost < ScalarizationCost) {
6868         Decision = CM_GatherScatter;
6869         Cost = GatherScatterCost;
6870       } else {
6871         Decision = CM_Scalarize;
6872         Cost = ScalarizationCost;
6873       }
      // If the instruction belongs to an interleave group, the whole group
6875       // receives the same decision. The whole group receives the cost, but
6876       // the cost will actually be assigned to one instruction.
6877       if (auto Group = getInterleavedAccessGroup(&I))
6878         setWideningDecision(Group, VF, Decision, Cost);
6879       else
6880         setWideningDecision(&I, VF, Decision, Cost);
6881     }
6882   }
6883 
6884   // Make sure that any load of address and any other address computation
6885   // remains scalar unless there is gather/scatter support. This avoids
6886   // inevitable extracts into address registers, and also has the benefit of
6887   // activating LSR more, since that pass can't optimize vectorized
6888   // addresses.
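  //
  // Illustrative example (made-up names): in
  //   %p = load i32*, i32** %pp
  //   %v = load i32, i32* %p
  // %p is only used as an address, so the load producing it is forced to be
  // scalarized below, and any non-load instructions feeding the address
  // computation are added to ForcedScalars.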
6889   if (TTI.prefersVectorizedAddressing())
6890     return;
6891 
6892   // Start with all scalar pointer uses.
6893   SmallPtrSet<Instruction *, 8> AddrDefs;
6894   for (BasicBlock *BB : TheLoop->blocks())
6895     for (Instruction &I : *BB) {
6896       Instruction *PtrDef =
6897         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6898       if (PtrDef && TheLoop->contains(PtrDef) &&
6899           getWideningDecision(&I, VF) != CM_GatherScatter)
6900         AddrDefs.insert(PtrDef);
6901     }
6902 
6903   // Add all instructions used to generate the addresses.
6904   SmallVector<Instruction *, 4> Worklist;
6905   append_range(Worklist, AddrDefs);
6906   while (!Worklist.empty()) {
6907     Instruction *I = Worklist.pop_back_val();
6908     for (auto &Op : I->operands())
6909       if (auto *InstOp = dyn_cast<Instruction>(Op))
6910         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6911             AddrDefs.insert(InstOp).second)
6912           Worklist.push_back(InstOp);
6913   }
6914 
6915   for (auto *I : AddrDefs) {
6916     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled
6918       // by cost functions, but since this involves the task of finding out
6919       // if the loaded register is involved in an address computation, it is
6920       // instead changed here when we know this is the case.
6921       InstWidening Decision = getWideningDecision(I, VF);
6922       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6923         // Scalarize a widened load of address.
6924         setWideningDecision(
6925             I, VF, CM_Scalarize,
6926             (VF.getKnownMinValue() *
6927              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6928       else if (auto Group = getInterleavedAccessGroup(I)) {
6929         // Scalarize an interleave group of address loads.
6930         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6931           if (Instruction *Member = Group->getMember(I))
6932             setWideningDecision(
6933                 Member, VF, CM_Scalarize,
6934                 (VF.getKnownMinValue() *
6935                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6936         }
6937       }
6938     } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
6941       ForcedScalars[VF].insert(I);
6942   }
6943 }
6944 
6945 InstructionCost
6946 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6947                                                Type *&VectorTy) {
6948   Type *RetTy = I->getType();
6949   if (canTruncateToMinimalBitwidth(I, VF))
6950     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6951   auto SE = PSE.getSE();
6952   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6953 
6954   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6955                                                 ElementCount VF) -> bool {
6956     if (VF.isScalar())
6957       return true;
6958 
6959     auto Scalarized = InstsToScalarize.find(VF);
6960     assert(Scalarized != InstsToScalarize.end() &&
6961            "VF not yet analyzed for scalarization profitability");
6962     return !Scalarized->second.count(I) &&
6963            llvm::all_of(I->users(), [&](User *U) {
6964              auto *UI = cast<Instruction>(U);
6965              return !Scalarized->second.count(UI);
6966            });
6967   };
6968   (void) hasSingleCopyAfterVectorization;
6969 
6970   if (isScalarAfterVectorization(I, VF)) {
6971     // With the exception of GEPs and PHIs, after scalarization there should
6972     // only be one copy of the instruction generated in the loop. This is
6973     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
6975     // it means we don't have to multiply the instruction cost by VF.
6976     assert(I->getOpcode() == Instruction::GetElementPtr ||
6977            I->getOpcode() == Instruction::PHI ||
6978            (I->getOpcode() == Instruction::BitCast &&
6979             I->getType()->isPointerTy()) ||
6980            hasSingleCopyAfterVectorization(I, VF));
6981     VectorTy = RetTy;
6982   } else
6983     VectorTy = ToVectorTy(RetTy, VF);
6984 
6985   // TODO: We need to estimate the cost of intrinsic calls.
6986   switch (I->getOpcode()) {
6987   case Instruction::GetElementPtr:
6988     // We mark this instruction as zero-cost because the cost of GEPs in
6989     // vectorized code depends on whether the corresponding memory instruction
6990     // is scalarized or not. Therefore, we handle GEPs with the memory
6991     // instruction cost.
6992     return 0;
6993   case Instruction::Br: {
6994     // In cases of scalarized and predicated instructions, there will be VF
6995     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
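    // As an illustration, with a fixed VF of 4 this amounts to four extracts
    // from the i1 mask vector plus four scalar branches, which is what the
    // scalarization-overhead query below accounts for.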
6997     bool ScalarPredicatedBB = false;
6998     BranchInst *BI = cast<BranchInst>(I);
6999     if (VF.isVector() && BI->isConditional() &&
7000         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7001          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7002       ScalarPredicatedBB = true;
7003 
7004     if (ScalarPredicatedBB) {
      // Not possible to scalarize a scalable vector with predicated
      // instructions.
7006       if (VF.isScalable())
7007         return InstructionCost::getInvalid();
7008       // Return cost for branches around scalarized and predicated blocks.
7009       auto *Vec_i1Ty =
7010           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7011       return (
7012           TTI.getScalarizationOverhead(
7013               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7014           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7015     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7016       // The back-edge branch will remain, as will all scalar branches.
7017       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7018     else
7019       // This branch will be eliminated by if-conversion.
7020       return 0;
7021     // Note: We currently assume zero cost for an unconditional branch inside
7022     // a predicated block since it will become a fall-through, although we
7023     // may decide in the future to call TTI for all branches.
7024   }
7025   case Instruction::PHI: {
7026     auto *Phi = cast<PHINode>(I);
7027 
7028     // First-order recurrences are replaced by vector shuffles inside the loop.
7029     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7030     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7031       return TTI.getShuffleCost(
7032           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7033           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7034 
7035     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7036     // converted into select instructions. We require N - 1 selects per phi
7037     // node, where N is the number of incoming values.
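    // For example (illustrative only), a phi node merging three incoming
    // values in an if-converted region becomes two vector selects.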
7038     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7039       return (Phi->getNumIncomingValues() - 1) *
7040              TTI.getCmpSelInstrCost(
7041                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7042                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7043                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7044 
7045     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7046   }
7047   case Instruction::UDiv:
7048   case Instruction::SDiv:
7049   case Instruction::URem:
7050   case Instruction::SRem:
7051     // If we have a predicated instruction, it may not be executed for each
7052     // vector lane. Get the scalarization cost and scale this amount by the
7053     // probability of executing the predicated block. If the instruction is not
7054     // predicated, we fall through to the next case.
7055     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7056       InstructionCost Cost = 0;
7057 
7058       // These instructions have a non-void type, so account for the phi nodes
7059       // that we will create. This cost is likely to be zero. The phi node
7060       // cost, if any, should be scaled by the block probability because it
7061       // models a copy at the end of each predicated block.
7062       Cost += VF.getKnownMinValue() *
7063               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7064 
7065       // The cost of the non-predicated instruction.
7066       Cost += VF.getKnownMinValue() *
7067               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7068 
7069       // The cost of insertelement and extractelement instructions needed for
7070       // scalarization.
7071       Cost += getScalarizationOverhead(I, VF);
7072 
7073       // Scale the cost by the probability of executing the predicated blocks.
7074       // This assumes the predicated block for each vector lane is equally
7075       // likely.
7076       return Cost / getReciprocalPredBlockProb();
7077     }
7078     LLVM_FALLTHROUGH;
7079   case Instruction::Add:
7080   case Instruction::FAdd:
7081   case Instruction::Sub:
7082   case Instruction::FSub:
7083   case Instruction::Mul:
7084   case Instruction::FMul:
7085   case Instruction::FDiv:
7086   case Instruction::FRem:
7087   case Instruction::Shl:
7088   case Instruction::LShr:
7089   case Instruction::AShr:
7090   case Instruction::And:
7091   case Instruction::Or:
7092   case Instruction::Xor: {
7093     // Since we will replace the stride by 1 the multiplication should go away.
7094     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7095       return 0;
7096 
7097     // Detect reduction patterns
7098     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7099       return *RedCost;
7100 
7101     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7103     Value *Op2 = I->getOperand(1);
7104     TargetTransformInfo::OperandValueProperties Op2VP;
7105     TargetTransformInfo::OperandValueKind Op2VK =
7106         TTI.getOperandInfo(Op2, Op2VP);
7107     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7108       Op2VK = TargetTransformInfo::OK_UniformValue;
7109 
7110     SmallVector<const Value *, 4> Operands(I->operand_values());
7111     return TTI.getArithmeticInstrCost(
7112         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7113         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7114   }
7115   case Instruction::FNeg: {
7116     return TTI.getArithmeticInstrCost(
7117         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7118         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7119         TargetTransformInfo::OP_None, I->getOperand(0), I);
7120   }
7121   case Instruction::Select: {
7122     SelectInst *SI = cast<SelectInst>(I);
7123     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7124     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7125 
7126     const Value *Op0, *Op1;
7127     using namespace llvm::PatternMatch;
7128     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7129                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7130       // select x, y, false --> x & y
7131       // select x, true, y --> x | y
7132       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7133       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7134       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7135       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7136       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7137               Op1->getType()->getScalarSizeInBits() == 1);
7138 
7139       SmallVector<const Value *, 2> Operands{Op0, Op1};
7140       return TTI.getArithmeticInstrCost(
7141           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7142           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7143     }
7144 
7145     Type *CondTy = SI->getCondition()->getType();
7146     if (!ScalarCond)
7147       CondTy = VectorType::get(CondTy, VF);
7148 
7149     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7150     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7151       Pred = Cmp->getPredicate();
7152     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7153                                   CostKind, I);
7154   }
7155   case Instruction::ICmp:
7156   case Instruction::FCmp: {
7157     Type *ValTy = I->getOperand(0)->getType();
7158     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7159     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7160       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7161     VectorTy = ToVectorTy(ValTy, VF);
7162     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7163                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7164                                   I);
7165   }
7166   case Instruction::Store:
7167   case Instruction::Load: {
7168     ElementCount Width = VF;
7169     if (Width.isVector()) {
7170       InstWidening Decision = getWideningDecision(I, Width);
7171       assert(Decision != CM_Unknown &&
7172              "CM decision should be taken at this point");
7173       if (Decision == CM_Scalarize)
7174         Width = ElementCount::getFixed(1);
7175     }
7176     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7177     return getMemoryInstructionCost(I, VF);
7178   }
7179   case Instruction::BitCast:
7180     if (I->getType()->isPointerTy())
7181       return 0;
7182     LLVM_FALLTHROUGH;
7183   case Instruction::ZExt:
7184   case Instruction::SExt:
7185   case Instruction::FPToUI:
7186   case Instruction::FPToSI:
7187   case Instruction::FPExt:
7188   case Instruction::PtrToInt:
7189   case Instruction::IntToPtr:
7190   case Instruction::SIToFP:
7191   case Instruction::UIToFP:
7192   case Instruction::Trunc:
7193   case Instruction::FPTrunc: {
7194     // Computes the CastContextHint from a Load/Store instruction.
7195     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7196       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7197              "Expected a load or a store!");
7198 
7199       if (VF.isScalar() || !TheLoop->contains(I))
7200         return TTI::CastContextHint::Normal;
7201 
7202       switch (getWideningDecision(I, VF)) {
7203       case LoopVectorizationCostModel::CM_GatherScatter:
7204         return TTI::CastContextHint::GatherScatter;
7205       case LoopVectorizationCostModel::CM_Interleave:
7206         return TTI::CastContextHint::Interleave;
7207       case LoopVectorizationCostModel::CM_Scalarize:
7208       case LoopVectorizationCostModel::CM_Widen:
7209         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7210                                         : TTI::CastContextHint::Normal;
7211       case LoopVectorizationCostModel::CM_Widen_Reverse:
7212         return TTI::CastContextHint::Reversed;
7213       case LoopVectorizationCostModel::CM_Unknown:
7214         llvm_unreachable("Instr did not go through cost modelling?");
7215       }
7216 
7217       llvm_unreachable("Unhandled case!");
7218     };
7219 
7220     unsigned Opcode = I->getOpcode();
7221     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7223     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7224       if (I->hasOneUse())
7225         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7226           CCH = ComputeCCH(Store);
7227     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7229     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7230              Opcode == Instruction::FPExt) {
7231       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7232         CCH = ComputeCCH(Load);
7233     }
7234 
7235     // We optimize the truncation of induction variables having constant
7236     // integer steps. The cost of these truncations is the same as the scalar
7237     // operation.
7238     if (isOptimizableIVTruncate(I, VF)) {
7239       auto *Trunc = cast<TruncInst>(I);
7240       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7241                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7242     }
7243 
7244     // Detect reduction patterns
7245     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7246       return *RedCost;
7247 
7248     Type *SrcScalarTy = I->getOperand(0)->getType();
7249     Type *SrcVecTy =
7250         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7251     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7255       //
7256       // Calculate the modified src and dest types.
7257       Type *MinVecTy = VectorTy;
7258       if (Opcode == Instruction::Trunc) {
7259         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7260         VectorTy =
7261             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7262       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7263         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7264         VectorTy =
7265             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7266       }
7267     }
7268 
7269     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7270   }
7271   case Instruction::Call: {
7272     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7273       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7274         return *RedCost;
7275     bool NeedToScalarize;
7276     CallInst *CI = cast<CallInst>(I);
7277     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7278     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7279       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7280       return std::min(CallCost, IntrinsicCost);
7281     }
7282     return CallCost;
7283   }
7284   case Instruction::ExtractValue:
7285     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7286   case Instruction::Alloca:
7287     // We cannot easily widen alloca to a scalable alloca, as
7288     // the result would need to be a vector of pointers.
7289     if (VF.isScalable())
7290       return InstructionCost::getInvalid();
7291     LLVM_FALLTHROUGH;
7292   default:
7293     // This opcode is unknown. Assume that it is the same as 'mul'.
7294     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7295   } // end of switch.
7296 }
7297 
7298 char LoopVectorize::ID = 0;
7299 
7300 static const char lv_name[] = "Loop Vectorization";
7301 
7302 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7303 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7304 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7305 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7306 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7307 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7308 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7309 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7310 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7311 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7312 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7313 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7314 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7315 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7316 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7317 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7318 
7319 namespace llvm {
7320 
7321 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7322 
7323 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7324                               bool VectorizeOnlyWhenForced) {
7325   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7326 }
7327 
7328 } // end namespace llvm
7329 
7330 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7331   // Check if the pointer operand of a load or store instruction is
7332   // consecutive.
7333   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7334     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7335   return false;
7336 }
7337 
7338 void LoopVectorizationCostModel::collectValuesToIgnore() {
7339   // Ignore ephemeral values.
7340   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7341 
  // Find all stores to invariant variables. Since they are going to be sunk
  // outside the loop, we do not need to calculate their cost.
7344   for (BasicBlock *BB : TheLoop->blocks())
7345     for (Instruction &I : *BB) {
7346       StoreInst *SI;
7347       if ((SI = dyn_cast<StoreInst>(&I)) &&
7348           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7349         ValuesToIgnore.insert(&I);
7350     }
7351 
7352   // Ignore type-promoting instructions we identified during reduction
7353   // detection.
7354   for (auto &Reduction : Legal->getReductionVars()) {
7355     const RecurrenceDescriptor &RedDes = Reduction.second;
7356     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7357     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7358   }
7359   // Ignore type-casting instructions we identified during induction
7360   // detection.
7361   for (auto &Induction : Legal->getInductionVars()) {
7362     const InductionDescriptor &IndDes = Induction.second;
7363     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7364     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7365   }
7366 }
7367 
7368 void LoopVectorizationCostModel::collectInLoopReductions() {
7369   for (auto &Reduction : Legal->getReductionVars()) {
7370     PHINode *Phi = Reduction.first;
7371     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7372 
7373     // We don't collect reductions that are type promoted (yet).
7374     if (RdxDesc.getRecurrenceType() != Phi->getType())
7375       continue;
7376 
7377     // If the target would prefer this reduction to happen "in-loop", then we
7378     // want to record it as such.
7379     unsigned Opcode = RdxDesc.getOpcode();
7380     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7381         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7382                                    TargetTransformInfo::ReductionFlags()))
7383       continue;
7384 
7385     // Check that we can correctly put the reductions into the loop, by
7386     // finding the chain of operations that leads from the phi to the loop
7387     // exit value.
7388     SmallVector<Instruction *, 4> ReductionOperations =
7389         RdxDesc.getReductionOpChain(Phi, TheLoop);
7390     bool InLoop = !ReductionOperations.empty();
7391     if (InLoop) {
7392       InLoopReductionChains[Phi] = ReductionOperations;
7393       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7394       Instruction *LastChain = Phi;
7395       for (auto *I : ReductionOperations) {
7396         InLoopReductionImmediateChains[I] = LastChain;
7397         LastChain = I;
7398       }
7399     }
7400     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7401                       << " reduction for phi: " << *Phi << "\n");
7402   }
7403 }
7404 
7405 // TODO: we could return a pair of values that specify the max VF and
7406 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7408 // doesn't have a cost model that can choose which plan to execute if
7409 // more than one is generated.
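// Determine a VPlan-native vectorization factor from the widest vector
// register width and the widest scalar type used in the loop.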
7410 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7411                                  LoopVectorizationCostModel &CM) {
7412   unsigned WidestType;
7413   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7414   return WidestVectorRegBits / WidestType;
7415 }
7416 
7417 VectorizationFactor
7418 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7419   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7420   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable.
7423   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7424   // the vectorization pipeline.
7425   if (!OrigLoop->isInnermost()) {
7426     // If the user doesn't provide a vectorization factor, determine a
7427     // reasonable one.
7428     if (UserVF.isZero()) {
7429       VF = ElementCount::getFixed(determineVPlanVF(
7430           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7431               .getFixedSize(),
7432           CM));
7433       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7434 
7435       // Make sure we have a VF > 1 for stress testing.
7436       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7437         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7438                           << "overriding computed VF.\n");
7439         VF = ElementCount::getFixed(4);
7440       }
7441     }
7442     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7443     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7444            "VF needs to be a power of two");
7445     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7446                       << "VF " << VF << " to build VPlans.\n");
7447     buildVPlans(VF, VF);
7448 
7449     // For VPlan build stress testing, we bail out after VPlan construction.
7450     if (VPlanBuildStressTest)
7451       return VectorizationFactor::Disabled();
7452 
7453     return {VF, 0 /*Cost*/};
7454   }
7455 
7456   LLVM_DEBUG(
7457       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7458                 "VPlan-native path.\n");
7459   return VectorizationFactor::Disabled();
7460 }
7461 
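// Return true if the number of runtime pointer checks needed for
// vectorization exceeds the thresholds, taking into account whether the
// loop hints allow memory-access reordering.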
7462 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
7463   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7464   return (NumRuntimePointerChecks >
7465               VectorizerParams::RuntimeMemoryCheckThreshold &&
7466           !Hints.allowReordering()) ||
7467          NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7468 }
7469 
7470 Optional<VectorizationFactor>
7471 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7472   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7473   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7475     return None;
7476 
7477   // Invalidate interleave groups if all blocks of loop will be predicated.
7478   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7479       !useMaskedInterleavedAccesses(*TTI)) {
7480     LLVM_DEBUG(
7481         dbgs()
7482         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7483            "which requires masked-interleaved support.\n");
7484     if (CM.InterleaveInfo.invalidateGroups())
7485       // Invalidating interleave groups also requires invalidating all decisions
7486       // based on them, which includes widening decisions and uniform and scalar
7487       // values.
7488       CM.invalidateCostModelingDecisions();
7489   }
7490 
7491   ElementCount MaxUserVF =
7492       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7493   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7494   if (!UserVF.isZero() && UserVFIsLegal) {
7495     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7496            "VF needs to be a power of two");
7497     // Collect the instructions (and their associated costs) that will be more
7498     // profitable to scalarize.
7499     if (CM.selectUserVectorizationFactor(UserVF)) {
7500       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7501       CM.collectInLoopReductions();
7502       buildVPlansWithVPRecipes(UserVF, UserVF);
7503       LLVM_DEBUG(printPlans(dbgs()));
7504       return {{UserVF, 0}};
7505     } else
7506       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7507                               "InvalidCost", ORE, OrigLoop);
7508   }
7509 
7510   // Populate the set of Vectorization Factor Candidates.
7511   ElementCountSet VFCandidates;
7512   for (auto VF = ElementCount::getFixed(1);
7513        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7514     VFCandidates.insert(VF);
7515   for (auto VF = ElementCount::getScalable(1);
7516        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7517     VFCandidates.insert(VF);
7518 
7519   for (const auto &VF : VFCandidates) {
7520     // Collect Uniform and Scalar instructions after vectorization with VF.
7521     CM.collectUniformsAndScalars(VF);
7522 
7523     // Collect the instructions (and their associated costs) that will be more
7524     // profitable to scalarize.
7525     if (VF.isVector())
7526       CM.collectInstsToScalarize(VF);
7527   }
7528 
7529   CM.collectInLoopReductions();
7530   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7531   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7532 
7533   LLVM_DEBUG(printPlans(dbgs()));
7534   if (!MaxFactors.hasVector())
7535     return VectorizationFactor::Disabled();
7536 
7537   // Select the optimal vectorization factor.
7538   return CM.selectVectorizationFactor(VFCandidates);
7539 }
7540 
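// Return the unique VPlan that covers \p VF; exactly one such plan is
// expected to exist.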
7541 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7542   assert(count_if(VPlans,
7543                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7544              1 &&
         "Best VF does not have a single VPlan.");
7546 
7547   for (const VPlanPtr &Plan : VPlans) {
7548     if (Plan->hasVF(VF))
7549       return *Plan.get();
7550   }
7551   llvm_unreachable("No plan found!");
7552 }
7553 
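// Add llvm.loop.unroll.runtime.disable metadata to the loop ID of \p L,
// unless unroll-disabling metadata is already present.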
7554 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7555   SmallVector<Metadata *, 4> MDs;
7556   // Reserve first location for self reference to the LoopID metadata node.
7557   MDs.push_back(nullptr);
7558   bool IsUnrollMetadata = false;
7559   MDNode *LoopID = L->getLoopID();
7560   if (LoopID) {
7561     // First find existing loop unrolling disable metadata.
7562     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7563       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7564       if (MD) {
7565         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7566         IsUnrollMetadata =
7567             S && S->getString().startswith("llvm.loop.unroll.disable");
7568       }
7569       MDs.push_back(LoopID->getOperand(i));
7570     }
7571   }
7572 
7573   if (!IsUnrollMetadata) {
7574     // Add runtime unroll disable metadata.
7575     LLVMContext &Context = L->getHeader()->getContext();
7576     SmallVector<Metadata *, 1> DisableOperands;
7577     DisableOperands.push_back(
7578         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7579     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7580     MDs.push_back(DisableNode);
7581     MDNode *NewLoopID = MDNode::get(Context, MDs);
7582     // Set operand 0 to refer to the loop id itself.
7583     NewLoopID->replaceOperandWith(0, NewLoopID);
7584     L->setLoopID(NewLoopID);
7585   }
7586 }
7587 
7588 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7589                                            VPlan &BestVPlan,
7590                                            InnerLoopVectorizer &ILV,
7591                                            DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');
7594 
7595   // Perform the actual loop transformation.
7596 
7597   // 1. Set up the skeleton for vectorization, including vector pre-header and
7598   // middle block. The vector loop is created during VPlan execution.
7599   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7600   Value *CanonicalIVStartValue;
7601   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7602       ILV.createVectorizedLoopSkeleton();
7603   ILV.collectPoisonGeneratingRecipes(State);
7604 
7605   ILV.printDebugTracesAtStart();
7606 
7607   //===------------------------------------------------===//
7608   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
7612   //
7613   //===------------------------------------------------===//
7614 
7615   // 2. Copy and widen instructions from the old loop into the new loop.
7616   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7617                              ILV.getOrCreateVectorTripCount(nullptr),
7618                              CanonicalIVStartValue, State);
7619   BestVPlan.execute(&State);
7620 
7623   MDNode *OrigLoopID = OrigLoop->getLoopID();
7624 
7625   Optional<MDNode *> VectorizedLoopID =
7626       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7627                                       LLVMLoopVectorizeFollowupVectorized});
7628 
7629   VPBasicBlock *HeaderVPBB =
7630       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7631   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7632   if (VectorizedLoopID.hasValue())
7633     L->setLoopID(VectorizedLoopID.getValue());
7634   else {
7635     // Keep all loop hints from the original loop on the vector loop (we'll
7636     // replace the vectorizer-specific hints below).
7637     if (MDNode *LID = OrigLoop->getLoopID())
7638       L->setLoopID(LID);
7639 
7640     LoopVectorizeHints Hints(L, true, *ORE);
7641     Hints.setAlreadyVectorized();
7642   }
7643   // Disable runtime unrolling when vectorizing the epilogue loop.
7644   if (CanonicalIVStartValue)
7645     AddRuntimeUnrollDisableMetaData(L);
7646 
7647   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7648   //    predication, updating analyses.
7649   ILV.fixVectorizedLoop(State, BestVPlan);
7650 
7651   ILV.printDebugTracesAtEnd();
7652 }
7653 
7654 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7655 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7656   for (const auto &Plan : VPlans)
7657     if (PrintVPlansInDotFormat)
7658       Plan->printDOT(O);
7659     else
7660       Plan->print(O);
7661 }
7662 #endif
7663 
7664 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7665     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7666 
  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
7670   SmallVector<BasicBlock*> ExitingBlocks;
7671   OrigLoop->getExitingBlocks(ExitingBlocks);
7672   for (auto *BB : ExitingBlocks) {
7673     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7674     if (!Cmp || !Cmp->hasOneUse())
7675       continue;
7676 
7677     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7678     if (!DeadInstructions.insert(Cmp).second)
7679       continue;
7680 
    // The operands of the icmp are often dead truncs, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
7687   }
7688 
7689   // We create new "steps" for induction variable updates to which the original
7690   // induction variables map. An original update instruction will be dead if
7691   // all its users except the induction variable are dead.
7692   auto *Latch = OrigLoop->getLoopLatch();
7693   for (auto &Induction : Legal->getInductionVars()) {
7694     PHINode *Ind = Induction.first;
7695     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7696 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7699     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7700       continue;
7701 
7702     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7703           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7704         }))
7705       DeadInstructions.insert(IndUpdate);
7706   }
7707 }
7708 
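// When unrolling only (VF == 1), a scalar value is its own broadcast.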
7709 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7710 
7711 //===--------------------------------------------------------------------===//
7712 // EpilogueVectorizerMainLoop
7713 //===--------------------------------------------------------------------===//
7714 
7715 /// This function is partially responsible for generating the control flow
7716 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7717 std::pair<BasicBlock *, Value *>
7718 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7719   MDNode *OrigLoopID = OrigLoop->getLoopID();
7720 
7721   // Workaround!  Compute the trip count of the original loop and cache it
7722   // before we start modifying the CFG.  This code has a systemic problem
7723   // wherein it tries to run analysis over partially constructed IR; this is
7724   // wrong, and not simply for SCEV.  The trip count of the original loop
7725   // simply happens to be prone to hitting this in practice.  In theory, we
7726   // can hit the same issue for any SCEV, or ValueTracking query done during
7727   // mutation.  See PR49900.
7728   getOrCreateTripCount(OrigLoop->getLoopPreheader());
7729   createVectorLoopSkeleton("");
7730 
7731   // Generate the code to check the minimum iteration count of the vector
7732   // epilogue (see below).
7733   EPI.EpilogueIterationCountCheck =
7734       emitIterationCountCheck(LoopScalarPreHeader, true);
7735   EPI.EpilogueIterationCountCheck->setName("iter.check");
7736 
7737   // Generate the code to check any assumptions that we've made for SCEV
7738   // expressions.
7739   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7740 
7741   // Generate the code that checks at runtime if arrays overlap. We put the
7742   // checks into a separate block to make the more common case of few elements
7743   // faster.
7744   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7745 
7746   // Generate the iteration count check for the main loop, *after* the check
7747   // for the epilogue loop, so that the path-length is shorter for the case
7748   // that goes directly through the vector epilogue. The longer-path length for
7749   // the main loop is compensated for, by the gain from vectorizing the larger
7750   // trip count. Note: the branch will get updated later on when we vectorize
7751   // the epilogue.
7752   EPI.MainLoopIterationCountCheck =
7753       emitIterationCountCheck(LoopScalarPreHeader, false);
7754 
7755   // Generate the induction variable.
7756   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7757 
  // Skip induction resume value creation here because the resume values will
  // be created in the second pass. If we created them here, they wouldn't be
  // used anyway, because the VPlan in the second pass still contains the
  // inductions from the original loop.
7762 
7763   return {completeLoopSkeleton(OrigLoopID), nullptr};
7764 }
7765 
7766 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7767   LLVM_DEBUG({
7768     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7769            << "Main Loop VF:" << EPI.MainLoopVF
7770            << ", Main Loop UF:" << EPI.MainLoopUF
7771            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7772            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7773   });
7774 }
7775 
7776 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7777   DEBUG_WITH_TYPE(VerboseDebug, {
7778     dbgs() << "intermediate fn:\n"
7779            << *OrigLoop->getHeader()->getParent() << "\n";
7780   });
7781 }
7782 
7783 BasicBlock *
7784 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7785                                                     bool ForEpilogue) {
7786   assert(Bypass && "Expected valid bypass basic block.");
7787   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7788   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7789   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7790   // Reuse existing vector loop preheader for TC checks.
7791   // Note that new preheader block is generated for vector loop.
7792   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7793   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7794 
7795   // Generate code to check if the loop's trip count is less than VF * UF of the
7796   // main vector loop.
7797   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7798       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7799 
7800   Value *CheckMinIters = Builder.CreateICmp(
7801       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7802       "min.iters.check");
7803 
7804   if (!ForEpilogue)
7805     TCCheckBlock->setName("vector.main.loop.iter.check");
7806 
7807   // Create new preheader for vector loop.
7808   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7809                                    DT, LI, nullptr, "vector.ph");
7810 
7811   if (ForEpilogue) {
7812     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7813                                  DT->getNode(Bypass)->getIDom()) &&
7814            "TC check is expected to dominate Bypass");
7815 
7816     // Update dominator for Bypass & LoopExit.
7817     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7818     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7819       // For loops with multiple exits, there's no edge from the middle block
7820       // to exit blocks (as the epilogue must run) and thus no need to update
7821       // the immediate dominator of the exit blocks.
7822       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7823 
7824     LoopBypassBlocks.push_back(TCCheckBlock);
7825 
7826     // Save the trip count so we don't have to regenerate it in the
7827     // vec.epilog.iter.check. This is safe to do because the trip count
7828     // generated here dominates the vector epilog iter check.
7829     EPI.TripCount = Count;
7830   }
7831 
7832   ReplaceInstWithInst(
7833       TCCheckBlock->getTerminator(),
7834       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7835 
7836   return TCCheckBlock;
7837 }
7838 
7839 //===--------------------------------------------------------------------===//
7840 // EpilogueVectorizerEpilogueLoop
7841 //===--------------------------------------------------------------------===//
7842 
7843 /// This function is partially responsible for generating the control flow
7844 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7845 std::pair<BasicBlock *, Value *>
7846 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7847   MDNode *OrigLoopID = OrigLoop->getLoopID();
7848   createVectorLoopSkeleton("vec.epilog.");
7849 
  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7852   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7853   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7854   LoopVectorPreHeader =
7855       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7856                  LI, nullptr, "vec.epilog.ph");
7857   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7858                                           VecEpilogueIterationCountCheck);
7859 
7860   // Adjust the control flow taking the state info from the main loop
7861   // vectorization into account.
7862   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7863          "expected this to be saved from the previous pass.");
7864   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7865       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7866 
7867   DT->changeImmediateDominator(LoopVectorPreHeader,
7868                                EPI.MainLoopIterationCountCheck);
7869 
7870   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7871       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7872 
7873   if (EPI.SCEVSafetyCheck)
7874     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7875         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7876   if (EPI.MemSafetyCheck)
7877     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7878         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7879 
7880   DT->changeImmediateDominator(
7881       VecEpilogueIterationCountCheck,
7882       VecEpilogueIterationCountCheck->getSinglePredecessor());
7883 
7884   DT->changeImmediateDominator(LoopScalarPreHeader,
7885                                EPI.EpilogueIterationCountCheck);
7886   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
7890     DT->changeImmediateDominator(LoopExitBlock,
7891                                  EPI.EpilogueIterationCountCheck);
7892 
7893   // Keep track of bypass blocks, as they feed start values to the induction
7894   // phis in the scalar loop preheader.
7895   if (EPI.SCEVSafetyCheck)
7896     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7897   if (EPI.MemSafetyCheck)
7898     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7899   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7900 
7901   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
7902   // merge control-flow from the latch block and the middle block. Update the
7903   // incoming values here and move the Phi into the preheader.
7904   SmallVector<PHINode *, 4> PhisInBlock;
7905   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7906     PhisInBlock.push_back(&Phi);
7907 
7908   for (PHINode *Phi : PhisInBlock) {
7909     Phi->replaceIncomingBlockWith(
7910         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7911         VecEpilogueIterationCountCheck);
7912     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7913     if (EPI.SCEVSafetyCheck)
7914       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7915     if (EPI.MemSafetyCheck)
7916       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7917     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7918   }
7919 
7920   // Generate a resume induction for the vector epilogue and put it in the
7921   // vector epilogue preheader
7922   Type *IdxTy = Legal->getWidestInductionType();
7923   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7924                                          LoopVectorPreHeader->getFirstNonPHI());
7925   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7926   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7927                            EPI.MainLoopIterationCountCheck);
7928 
7929   // Generate induction resume values. These variables save the new starting
7930   // indexes for the scalar loop. They are used to test if there are any tail
7931   // iterations left once the vector loop has completed.
7932   // Note that when the vectorized epilogue is skipped due to iteration count
7933   // check, then the resume value for the induction variable comes from
7934   // the trip count of the main vector loop, hence passing the AdditionalBypass
7935   // argument.
7936   createInductionResumeValues({VecEpilogueIterationCountCheck,
7937                                EPI.VectorTripCount} /* AdditionalBypass */);
7938 
7939   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7940 }
7941 
7942 BasicBlock *
7943 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7944     BasicBlock *Bypass, BasicBlock *Insert) {
7945 
7946   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
7948   assert(
7949       (!isa<Instruction>(EPI.TripCount) ||
7950        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7951       "saved trip count does not dominate insertion point.");
7952   Value *TC = EPI.TripCount;
7953   IRBuilder<> Builder(Insert->getTerminator());
7954   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7955 
7956   // Generate code to check if the loop's trip count is less than VF * UF of the
7957   // vector epilogue loop.
7958   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7959       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7960 
7961   Value *CheckMinIters =
7962       Builder.CreateICmp(P, Count,
7963                          createStepForVF(Builder, Count->getType(),
7964                                          EPI.EpilogueVF, EPI.EpilogueUF),
7965                          "min.epilog.iters.check");
7966 
7967   ReplaceInstWithInst(
7968       Insert->getTerminator(),
7969       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7970 
7971   LoopBypassBlocks.push_back(Insert);
7972   return Insert;
7973 }
7974 
7975 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7976   LLVM_DEBUG({
7977     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7978            << "Epilogue Loop VF:" << EPI.EpilogueVF
7979            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7980   });
7981 }
7982 
7983 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7984   DEBUG_WITH_TYPE(VerboseDebug, {
7985     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7986   });
7987 }
7988 
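// Evaluate \p Predicate on the power-of-two VFs in \p Range, clamp Range.End
// at the first VF whose result differs from the result at Range.Start, and
// return the result at Range.Start.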
7989 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7990     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7991   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7992   bool PredicateAtRangeStart = Predicate(Range.Start);
7993 
7994   for (ElementCount TmpVF = Range.Start * 2;
7995        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7996     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7997       Range.End = TmpVF;
7998       break;
7999     }
8000 
8001   return PredicateAtRangeStart;
8002 }
8003 
8004 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8005 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8006 /// of VF's starting at a given VF and extending it as much as possible. Each
8007 /// vectorization decision can potentially shorten this sub-range during
8008 /// buildVPlan().
8009 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8010                                            ElementCount MaxVF) {
8011   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8012   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8013     VFRange SubRange = {VF, MaxVFPlusOne};
8014     VPlans.push_back(buildVPlan(SubRange));
8015     VF = SubRange.End;
8016   }
8017 }
8018 
8019 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8020                                          VPlanPtr &Plan) {
8021   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8022 
8023   // Look for cached value.
8024   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8025   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8026   if (ECEntryIt != EdgeMaskCache.end())
8027     return ECEntryIt->second;
8028 
8029   VPValue *SrcMask = createBlockInMask(Src, Plan);
8030 
8031   // The terminator has to be a branch inst!
8032   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8033   assert(BI && "Unexpected terminator found");
8034 
8035   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8036     return EdgeMaskCache[Edge] = SrcMask;
8037 
8038   // If source is an exiting block, we know the exit edge is dynamically dead
8039   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8040   // adding uses of an otherwise potentially dead instruction.
8041   if (OrigLoop->isLoopExiting(Src))
8042     return EdgeMaskCache[Edge] = SrcMask;
8043 
8044   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8045   assert(EdgeMask && "No Edge Mask found for condition");
8046 
8047   if (BI->getSuccessor(0) != Dst)
8048     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8049 
8050   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8051     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8052     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8053     // The select version does not introduce new UB if SrcMask is false and
8054     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8055     VPValue *False = Plan->getOrAddVPValue(
8056         ConstantInt::getFalse(BI->getCondition()->getType()));
8057     EdgeMask =
8058         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8059   }
8060 
8061   return EdgeMaskCache[Edge] = EdgeMask;
8062 }
8063 
8064 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8065   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8066 
8067   // Look for cached value.
8068   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8069   if (BCEntryIt != BlockMaskCache.end())
8070     return BCEntryIt->second;
8071 
8072   // All-one mask is modelled as no-mask following the convention for masked
8073   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8074   VPValue *BlockMask = nullptr;
8075 
8076   if (OrigLoop->getHeader() == BB) {
8077     if (!CM.blockNeedsPredicationForAnyReason(BB))
8078       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8079 
8080     // Introduce the early-exit compare IV <= BTC to form header block mask.
8081     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8082     // constructing the desired canonical IV in the header block as its first
8083     // non-phi instructions.
8084     assert(CM.foldTailByMasking() && "must fold the tail");
8085     VPBasicBlock *HeaderVPBB =
8086         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8087     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8088     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8089     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8090 
8091     VPBuilder::InsertPointGuard Guard(Builder);
8092     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8093     if (CM.TTI.emitGetActiveLaneMask()) {
8094       VPValue *TC = Plan->getOrCreateTripCount();
8095       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8096     } else {
8097       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8098       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8099     }
8100     return BlockMaskCache[BB] = BlockMask;
8101   }
8102 
8103   // This is the block mask. We OR all incoming edges.
8104   for (auto *Predecessor : predecessors(BB)) {
8105     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8106     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8107       return BlockMaskCache[BB] = EdgeMask;
8108 
8109     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8110       BlockMask = EdgeMask;
8111       continue;
8112     }
8113 
8114     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8115   }
8116 
8117   return BlockMaskCache[BB] = BlockMask;
8118 }
8119 
8120 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8121                                                 ArrayRef<VPValue *> Operands,
8122                                                 VFRange &Range,
8123                                                 VPlanPtr &Plan) {
8124   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8125          "Must be called with either a load or store");
8126 
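  // Returns true if the memory access will be widened (including as a
  // gather/scatter or as part of an interleave group) for the given VF,
  // rather than scalarized.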
8127   auto willWiden = [&](ElementCount VF) -> bool {
8128     LoopVectorizationCostModel::InstWidening Decision =
8129         CM.getWideningDecision(I, VF);
8130     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8131            "CM decision should be taken at this point.");
8132     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8133       return true;
8134     if (CM.isScalarAfterVectorization(I, VF) ||
8135         CM.isProfitableToScalarize(I, VF))
8136       return false;
8137     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8138   };
8139 
8140   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8141     return nullptr;
8142 
8143   VPValue *Mask = nullptr;
8144   if (Legal->isMaskRequired(I))
8145     Mask = createBlockInMask(I->getParent(), Plan);
8146 
8147   // Determine if the pointer operand of the access is either consecutive or
8148   // reverse consecutive.
8149   LoopVectorizationCostModel::InstWidening Decision =
8150       CM.getWideningDecision(I, Range.Start);
8151   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8152   bool Consecutive =
8153       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8154 
8155   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8156     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8157                                               Consecutive, Reverse);
8158 
8159   StoreInst *Store = cast<StoreInst>(I);
8160   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8161                                             Mask, Consecutive, Reverse);
8162 }
8163 
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8165 /// insert a recipe to expand the step for the induction recipe.
8166 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8167     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8168     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8169     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8170   // Returns true if an instruction \p I should be scalarized instead of
8171   // vectorized for the chosen vectorization factor.
8172   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8173     return CM.isScalarAfterVectorization(I, VF) ||
8174            CM.isProfitableToScalarize(I, VF);
8175   };
8176 
8177   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8178       [&](ElementCount VF) {
8179         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8180       },
8181       Range);
8182   assert(IndDesc.getStartValue() ==
8183          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8184   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8185          "step must be loop invariant");
8186 
8187   VPValue *Step =
8188       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8189   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8190     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8191                                              !NeedsScalarIVOnly);
8192   }
8193   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8194   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8195                                            !NeedsScalarIVOnly);
8196 }
8197 
8198 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8199     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8200 
8201   // Check if this is an integer or fp induction. If so, build the recipe that
8202   // produces its scalar and vector values.
8203   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8204     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8205                                        *PSE.getSE(), *OrigLoop, Range);
8206 
8207   // Check if this is pointer induction. If so, build the recipe for it.
8208   if (auto *II = Legal->getPointerInductionDescriptor(Phi))
8209     return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
8210                                              *PSE.getSE());
8211   return nullptr;
8212 }
8213 
8214 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8215     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8216   // Optimize the special case where the source is a constant integer
8217   // induction variable. Notice that we can only optimize the 'trunc' case
8218   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8219   // (c) other casts depend on pointer size.
8220 
8221   // Determine whether \p K is a truncation based on an induction variable that
8222   // can be optimized.
8223   auto isOptimizableIVTruncate =
8224       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8225     return [=](ElementCount VF) -> bool {
8226       return CM.isOptimizableIVTruncate(K, VF);
8227     };
8228   };
8229 
8230   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8231           isOptimizableIVTruncate(I), Range)) {
8232 
8233     auto *Phi = cast<PHINode>(I->getOperand(0));
8234     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8235     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8236     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8237                                        *PSE.getSE(), *OrigLoop, Range);
8238   }
8239   return nullptr;
8240 }
8241 
8242 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8243                                                 ArrayRef<VPValue *> Operands,
8244                                                 VPlanPtr &Plan) {
8245   // If all incoming values are equal, the incoming VPValue can be used directly
8246   // instead of creating a new VPBlendRecipe.
8247   VPValue *FirstIncoming = Operands[0];
8248   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8249         return FirstIncoming == Inc;
8250       })) {
8251     return Operands[0];
8252   }
8253 
8254   unsigned NumIncoming = Phi->getNumIncomingValues();
8255   // For in-loop reductions, we do not need to create an additional select.
8256   VPValue *InLoopVal = nullptr;
8257   for (unsigned In = 0; In < NumIncoming; In++) {
8258     PHINode *PhiOp =
8259         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8260     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8261       assert(!InLoopVal && "Found more than one in-loop reduction!");
8262       InLoopVal = Operands[In];
8263     }
8264   }
8265 
8266   assert((!InLoopVal || NumIncoming == 2) &&
8267          "Found an in-loop reduction for PHI with unexpected number of "
8268          "incoming values");
8269   if (InLoopVal)
8270     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8271 
8272   // We know that all PHIs in non-header blocks are converted into selects, so
8273   // we don't have to worry about the insertion order and we can just use the
8274   // builder. At this point we generate the predication tree. There may be
8275   // duplications since this is a simple recursive scan, but future
8276   // optimizations will clean it up.
8277   SmallVector<VPValue *, 2> OperandsWithMask;
8278 
8279   for (unsigned In = 0; In < NumIncoming; In++) {
8280     VPValue *EdgeMask =
8281       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8282     assert((EdgeMask || NumIncoming == 1) &&
8283            "Multiple predecessors with one having a full mask");
8284     OperandsWithMask.push_back(Operands[In]);
8285     if (EdgeMask)
8286       OperandsWithMask.push_back(EdgeMask);
8287   }
8288   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8289 }
8290 
8291 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8292                                                    ArrayRef<VPValue *> Operands,
8293                                                    VFRange &Range) const {
8294 
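  // Calls that have to be scalarized with predication are never widened.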
8295   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8296       [this, CI](ElementCount VF) {
8297         return CM.isScalarWithPredication(CI, VF);
8298       },
8299       Range);
8300 
8301   if (IsPredicated)
8302     return nullptr;
8303 
8304   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8305   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8306              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8307              ID == Intrinsic::pseudoprobe ||
8308              ID == Intrinsic::experimental_noalias_scope_decl))
8309     return nullptr;
8310 
8311   auto willWiden = [&](ElementCount VF) -> bool {
8312     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a usual call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform an intrinsic call compared to a lib call?
8317     bool NeedToScalarize = false;
8318     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8319     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8320     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8321     return UseVectorIntrinsic || !NeedToScalarize;
8322   };
8323 
8324   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8325     return nullptr;
8326 
8327   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8328   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8329 }
8330 
8331 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8332   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8333          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8334   // Instruction should be widened, unless it is scalar after vectorization,
8335   // scalarization is profitable or it is predicated.
8336   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8337     return CM.isScalarAfterVectorization(I, VF) ||
8338            CM.isProfitableToScalarize(I, VF) ||
8339            CM.isScalarWithPredication(I, VF);
8340   };
8341   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8342                                                              Range);
8343 }
8344 
8345 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8346                                            ArrayRef<VPValue *> Operands) const {
8347   auto IsVectorizableOpcode = [](unsigned Opcode) {
8348     switch (Opcode) {
8349     case Instruction::Add:
8350     case Instruction::And:
8351     case Instruction::AShr:
8352     case Instruction::BitCast:
8353     case Instruction::FAdd:
8354     case Instruction::FCmp:
8355     case Instruction::FDiv:
8356     case Instruction::FMul:
8357     case Instruction::FNeg:
8358     case Instruction::FPExt:
8359     case Instruction::FPToSI:
8360     case Instruction::FPToUI:
8361     case Instruction::FPTrunc:
8362     case Instruction::FRem:
8363     case Instruction::FSub:
8364     case Instruction::ICmp:
8365     case Instruction::IntToPtr:
8366     case Instruction::LShr:
8367     case Instruction::Mul:
8368     case Instruction::Or:
8369     case Instruction::PtrToInt:
8370     case Instruction::SDiv:
8371     case Instruction::Select:
8372     case Instruction::SExt:
8373     case Instruction::Shl:
8374     case Instruction::SIToFP:
8375     case Instruction::SRem:
8376     case Instruction::Sub:
8377     case Instruction::Trunc:
8378     case Instruction::UDiv:
8379     case Instruction::UIToFP:
8380     case Instruction::URem:
8381     case Instruction::Xor:
8382     case Instruction::ZExt:
8383     case Instruction::Freeze:
8384       return true;
8385     }
8386     return false;
8387   };
8388 
8389   if (!IsVectorizableOpcode(I->getOpcode()))
8390     return nullptr;
8391 
8392   // Success: widen this instruction.
8393   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8394 }
8395 
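// Complete the header phi recipes collected during recipe construction by
// adding the value incoming from the loop latch as their backedge operand.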
8396 void VPRecipeBuilder::fixHeaderPhis() {
8397   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8398   for (VPHeaderPHIRecipe *R : PhisToFix) {
8399     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8400     VPRecipeBase *IncR =
8401         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8402     R->addOperand(IncR->getVPSingleValue());
8403   }
8404 }
8405 
8406 VPBasicBlock *VPRecipeBuilder::handleReplication(
8407     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8408     VPlanPtr &Plan) {
8409   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8410       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8411       Range);
8412 
8413   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8414       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8415       Range);
8416 
8417   // Even if the instruction is not marked as uniform, there are certain
8418   // intrinsic calls that can be effectively treated as such, so we check for
8419   // them here. Conservatively, we only do this for scalable vectors, since
8420   // for fixed-width VFs we can always fall back on full scalarization.
8421   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8422     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8423     case Intrinsic::assume:
8424     case Intrinsic::lifetime_start:
8425     case Intrinsic::lifetime_end:
8426       // For scalable vectors if one of the operands is variant then we still
8427       // want to mark as uniform, which will generate one instruction for just
8428       // the first lane of the vector. We can't scalarize the call in the same
8429       // way as for fixed-width vectors because we don't know how many lanes
8430       // there are.
8431       //
8432       // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
8435       //      example, the input may be a splat across all lanes.
8436       //   2. For the lifetime start/end intrinsics the pointer operand only
8437       //      does anything useful when the input comes from a stack object,
8438       //      which suggests it should always be uniform. For non-stack objects
8439       //      the effect is to poison the object, which still allows us to
8440       //      remove the call.
8441       IsUniform = true;
8442       break;
8443     default:
8444       break;
8445     }
8446   }
8447 
8448   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8449                                        IsUniform, IsPredicated);
8450   setRecipe(I, Recipe);
8451   Plan->addVPValue(I, Recipe);
8452 
8453   // Find if I uses a predicated instruction. If so, it will use its scalar
8454   // value. Avoid hoisting the insert-element which packs the scalar value into
8455   // a vector value, as that happens iff all users use the vector value.
8456   for (VPValue *Op : Recipe->operands()) {
8457     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8458     if (!PredR)
8459       continue;
8460     auto *RepR =
8461         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8462     assert(RepR->isPredicated() &&
8463            "expected Replicate recipe to be predicated");
8464     RepR->setAlsoPack(false);
8465   }
8466 
8467   // Finalize the recipe for Instr, first if it is not predicated.
8468   if (!IsPredicated) {
8469     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8470     VPBB->appendRecipe(Recipe);
8471     return VPBB;
8472   }
8473   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8474 
8475   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8476   assert(SingleSucc && "VPBB must have a single successor when handling "
8477                        "predicated replication.");
8478   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8479   // Record predicated instructions for above packing optimizations.
8480   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8481   VPBlockUtils::insertBlockAfter(Region, VPBB);
8482   auto *RegSucc = new VPBasicBlock();
8483   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8484   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8485   return RegSucc;
8486 }
8487 
8488 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8489                                                       VPRecipeBase *PredRecipe,
8490                                                       VPlanPtr &Plan) {
8491   // Instructions marked for predication are replicated and placed under an
8492   // if-then construct to prevent side-effects.
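  //
  // Roughly, the resulting region has a triangular shape (block names as
  // generated below):
  //
  //        pred.<opcode>.entry      (BranchOnMask)
  //          /            \
  //   pred.<opcode>.if     |        (replicated instruction)
  //          \            /
  //       pred.<opcode>.continue    (VPPredInstPHI, if the result is used)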
8493 
8494   // Generate recipes to compute the block mask for this region.
8495   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8496 
8497   // Build the triangular if-then region.
8498   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8499   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8500   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8501   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8502   auto *PHIRecipe = Instr->getType()->isVoidTy()
8503                         ? nullptr
8504                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8505   if (PHIRecipe) {
8506     Plan->removeVPValueFor(Instr);
8507     Plan->addVPValue(Instr, PHIRecipe);
8508   }
8509   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8510   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8511   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8512 
8513   // Note: first set Entry as region entry and then connect successors starting
8514   // from it in order, to propagate the "parent" of each VPBasicBlock.
8515   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8516   VPBlockUtils::connectBlocks(Pred, Exiting);
8517 
8518   return Region;
8519 }
8520 
8521 VPRecipeOrVPValueTy
8522 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8523                                         ArrayRef<VPValue *> Operands,
8524                                         VFRange &Range, VPlanPtr &Plan) {
8525   // First, check for specific widening recipes that deal with inductions, Phi
8526   // nodes, calls and memory operations.
8527   VPRecipeBase *Recipe;
8528   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8529     if (Phi->getParent() != OrigLoop->getHeader())
8530       return tryToBlend(Phi, Operands, Plan);
8531     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8532       return toVPRecipeResult(Recipe);
8533 
8534     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8535     assert((Legal->isReductionVariable(Phi) ||
8536             Legal->isFirstOrderRecurrence(Phi)) &&
8537            "can only widen reductions and first-order recurrences here");
8538     VPValue *StartV = Operands[0];
8539     if (Legal->isReductionVariable(Phi)) {
8540       const RecurrenceDescriptor &RdxDesc =
8541           Legal->getReductionVars().find(Phi)->second;
8542       assert(RdxDesc.getRecurrenceStartValue() ==
8543              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8544       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8545                                            CM.isInLoopReduction(Phi),
8546                                            CM.useOrderedReductions(RdxDesc));
8547     } else {
8548       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8549     }
8550 
8551     // Record the incoming value from the backedge, so we can add the incoming
8552     // value from the backedge after all recipes have been created.
8553     recordRecipeOf(cast<Instruction>(
8554         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8555     PhisToFix.push_back(PhiRecipe);
8556     return toVPRecipeResult(PhiRecipe);
8557   }
8558 
8559   if (isa<TruncInst>(Instr) &&
8560       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8561                                                Range, *Plan)))
8562     return toVPRecipeResult(Recipe);
8563 
8564   // All widen recipes below deal only with VF > 1.
8565   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8566           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8567     return nullptr;
8568 
8569   if (auto *CI = dyn_cast<CallInst>(Instr))
8570     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8571 
8572   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8573     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8574 
8575   if (!shouldWiden(Instr, Range))
8576     return nullptr;
8577 
8578   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8579     return toVPRecipeResult(new VPWidenGEPRecipe(
8580         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8581 
8582   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8583     bool InvariantCond =
8584         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8585     return toVPRecipeResult(new VPWidenSelectRecipe(
8586         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8587   }
8588 
8589   return toVPRecipeResult(tryToWiden(Instr, Operands));
8590 }
8591 
8592 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8593                                                         ElementCount MaxVF) {
8594   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8595 
8596   // Collect instructions from the original loop that will become trivially dead
8597   // in the vectorized loop. We don't need to vectorize these instructions. For
8598   // example, original induction update instructions can become dead because we
8599   // separately emit induction "steps" when generating code for the new loop.
8600   // Similarly, we create a new latch condition when setting up the structure
8601   // of the new loop, so the old one can become dead.
8602   SmallPtrSet<Instruction *, 4> DeadInstructions;
8603   collectTriviallyDeadInstructions(DeadInstructions);
8604 
8605   // Add assume instructions we need to drop to DeadInstructions, to prevent
8606   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8608   // control flow is preserved, we should keep them.
8609   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8610   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8611 
8612   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8613   // Dead instructions do not need sinking. Remove them from SinkAfter.
8614   for (Instruction *I : DeadInstructions)
8615     SinkAfter.erase(I);
8616 
8617   // Cannot sink instructions after dead instructions (there won't be any
8618   // recipes for them). Instead, find the first non-dead previous instruction.
8619   for (auto &P : Legal->getSinkAfter()) {
8620     Instruction *SinkTarget = P.second;
8621     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8622     (void)FirstInst;
8623     while (DeadInstructions.contains(SinkTarget)) {
8624       assert(
8625           SinkTarget != FirstInst &&
8626           "Must find a live instruction (at least the one feeding the "
8627           "first-order recurrence PHI) before reaching beginning of the block");
8628       SinkTarget = SinkTarget->getPrevNode();
8629       assert(SinkTarget != P.first &&
8630              "sink source equals target, no sinking required");
8631     }
8632     P.second = SinkTarget;
8633   }
8634 
8635   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
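  // For example, with MinVF = 2 and MaxVF = 8 this may build one VPlan for
  // VF = {2, 4} and another for VF = {8}; the exact split depends on where
  // recipe-construction decisions clamp SubRange.End.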
8636   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8637     VFRange SubRange = {VF, MaxVFPlusOne};
8638     VPlans.push_back(
8639         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8640     VF = SubRange.End;
8641   }
8642 }
8643 
8644 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8645 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8646 // BranchOnCount VPInstruction to the latch.
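//
// Conceptually, after this runs the header starts with
//   %iv = CANONICAL-INDUCTION [ 0, preheader ], [ %iv.next, latch ]
// and the exiting (latch) block ends with
//   %iv.next = CanonicalIVIncrement{NUW} %iv   ; increment by VF * UF
//   BranchOnCount %iv.next, vector-trip-count
// (illustrative notation, not the exact VPlan printing format).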
8647 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8648                                   bool HasNUW) {
8649   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8650   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8651 
8652   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8653   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8654   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8655   Header->insert(CanonicalIVPHI, Header->begin());
8656 
8657   auto *CanonicalIVIncrement =
8658       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8659                                : VPInstruction::CanonicalIVIncrement,
8660                         {CanonicalIVPHI}, DL);
8661   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8662 
8663   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8664   EB->appendRecipe(CanonicalIVIncrement);
8665 
8666   auto *BranchOnCount =
8667       new VPInstruction(VPInstruction::BranchOnCount,
8668                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8669   EB->appendRecipe(BranchOnCount);
8670 }
8671 
8672 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8673 // original exit block.
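// For example, for an LCSSA phi like `%lcssa = phi [ %v, %exiting ]`, a
// VPLiveOut is added that maps %lcssa to the VPValue modeling %v.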
8674 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8675                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8676                                 VPlan &Plan) {
8677   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8678   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8679   // Only handle single-exit loops with unique exit blocks for now.
8680   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8681     return;
8682 
8683   // Introduce VPUsers modeling the exit values.
8684   for (PHINode &ExitPhi : ExitBB->phis()) {
8685     Value *IncomingValue =
8686         ExitPhi.getIncomingValueForBlock(ExitingBB);
8687     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8688     Plan.addLiveOut(&ExitPhi, V);
8689   }
8690 }
8691 
8692 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8693     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8694     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8695 
8696   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8697 
8698   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8699 
8700   // ---------------------------------------------------------------------------
8701   // Pre-construction: record ingredients whose recipes we'll need to further
8702   // process after constructing the initial VPlan.
8703   // ---------------------------------------------------------------------------
8704 
8705   // Mark instructions we'll need to sink later and their targets as
8706   // ingredients whose recipe we'll need to record.
8707   for (auto &Entry : SinkAfter) {
8708     RecipeBuilder.recordRecipeOf(Entry.first);
8709     RecipeBuilder.recordRecipeOf(Entry.second);
8710   }
8711   for (auto &Reduction : CM.getInLoopReductionChains()) {
8712     PHINode *Phi = Reduction.first;
8713     RecurKind Kind =
8714         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8715     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8716 
8717     RecipeBuilder.recordRecipeOf(Phi);
8718     for (auto &R : ReductionOperations) {
8719       RecipeBuilder.recordRecipeOf(R);
8720       // For min/max reductions, where we have a pair of icmp/select, we also
8721       // need to record the ICmp recipe, so it can be removed later.
8722       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8723              "Only min/max recurrences allowed for inloop reductions");
8724       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8725         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8726     }
8727   }
8728 
8729   // For each interleave group which is relevant for this (possibly trimmed)
8730   // Range, add it to the set of groups to be later applied to the VPlan and add
8731   // placeholders for its members' Recipes which we'll be replacing with a
8732   // single VPInterleaveRecipe.
8733   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8734     auto applyIG = [IG, this](ElementCount VF) -> bool {
8735       return (VF.isVector() && // Query is illegal for VF == 1
8736               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8737                   LoopVectorizationCostModel::CM_Interleave);
8738     };
8739     if (!getDecisionAndClampRange(applyIG, Range))
8740       continue;
8741     InterleaveGroups.insert(IG);
8742     for (unsigned i = 0; i < IG->getFactor(); i++)
8743       if (Instruction *Member = IG->getMember(i))
8744         RecipeBuilder.recordRecipeOf(Member);
8745   };
8746 
8747   // ---------------------------------------------------------------------------
8748   // Build initial VPlan: Scan the body of the loop in a topological order to
8749   // visit each basic block after having visited its predecessor basic blocks.
8750   // ---------------------------------------------------------------------------
8751 
8752   // Create initial VPlan skeleton, starting with a block for the pre-header,
8753   // followed by a region for the vector loop, followed by the middle block. The
8754   // skeleton vector loop region contains a header and latch block.
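  // That is, the initial skeleton is:
  //   vector.ph -> [ vector loop: vector.body -> vector.latch ] -> middle.block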
8755   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8756   auto Plan = std::make_unique<VPlan>(Preheader);
8757 
8758   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8759   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8760   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8761   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8762   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8763   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8764   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8765 
8766   Instruction *DLInst =
8767       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8768   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8769                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8770                         !CM.foldTailByMasking());
8771 
8772   // Scan the body of the loop in a topological order to visit each basic block
8773   // after having visited its predecessor basic blocks.
8774   LoopBlocksDFS DFS(OrigLoop);
8775   DFS.perform(LI);
8776 
8777   VPBasicBlock *VPBB = HeaderVPBB;
8778   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8779   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8780     // Relevant instructions from basic block BB will be grouped into VPRecipe
8781     // ingredients and fill a new VPBasicBlock.
8782     unsigned VPBBsForBB = 0;
8783     if (VPBB != HeaderVPBB)
8784       VPBB->setName(BB->getName());
8785     Builder.setInsertPoint(VPBB);
8786 
8787     // Introduce each ingredient into VPlan.
8788     // TODO: Model and preserve debug intrinsics in VPlan.
8789     for (Instruction &I : BB->instructionsWithoutDebug()) {
8790       Instruction *Instr = &I;
8791 
8792       // First filter out irrelevant instructions, to ensure no recipes are
8793       // built for them.
8794       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8795         continue;
8796 
8797       SmallVector<VPValue *, 4> Operands;
8798       auto *Phi = dyn_cast<PHINode>(Instr);
8799       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8800         Operands.push_back(Plan->getOrAddVPValue(
8801             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8802       } else {
8803         auto OpRange = Plan->mapToVPValues(Instr->operands());
8804         Operands = {OpRange.begin(), OpRange.end()};
8805       }
8806 
      // Invariant stores inside the loop will be deleted, and a single store
      // with the final reduction value will be added to the exit block.
8809       StoreInst *SI;
8810       if ((SI = dyn_cast<StoreInst>(&I)) &&
8811           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8812         continue;
8813 
8814       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8815               Instr, Operands, Range, Plan)) {
8816         // If Instr can be simplified to an existing VPValue, use it.
8817         if (RecipeOrValue.is<VPValue *>()) {
8818           auto *VPV = RecipeOrValue.get<VPValue *>();
8819           Plan->addVPValue(Instr, VPV);
8820           // If the re-used value is a recipe, register the recipe for the
8821           // instruction, in case the recipe for Instr needs to be recorded.
8822           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8823             RecipeBuilder.setRecipe(Instr, R);
8824           continue;
8825         }
8826         // Otherwise, add the new recipe.
8827         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8828         for (auto *Def : Recipe->definedValues()) {
8829           auto *UV = Def->getUnderlyingValue();
8830           Plan->addVPValue(UV, Def);
8831         }
8832 
8833         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8834             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8835           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8836           // of the header block. That can happen for truncates of induction
8837           // variables. Those recipes are moved to the phi section of the header
8838           // block after applying SinkAfter, which relies on the original
8839           // position of the trunc.
8840           assert(isa<TruncInst>(Instr));
8841           InductionsToMove.push_back(
8842               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8843         }
8844         RecipeBuilder.setRecipe(Instr, Recipe);
8845         VPBB->appendRecipe(Recipe);
8846         continue;
8847       }
8848 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8851       VPBasicBlock *NextVPBB =
8852           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8853       if (NextVPBB != VPBB) {
8854         VPBB = NextVPBB;
8855         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8856                                     : "");
8857       }
8858     }
8859 
8860     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8861     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8862   }
8863 
8864   HeaderVPBB->setName("vector.body");
8865 
8866   // Fold the last, empty block into its predecessor.
8867   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8868   assert(VPBB && "expected to fold last (empty) block");
8869   // After here, VPBB should not be used.
8870   VPBB = nullptr;
8871 
8872   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8873 
8874   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8875          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8876          "entry block must be set to a VPRegionBlock having a non-empty entry "
8877          "VPBasicBlock");
8878   RecipeBuilder.fixHeaderPhis();
8879 
8880   // ---------------------------------------------------------------------------
8881   // Transform initial VPlan: Apply previously taken decisions, in order, to
8882   // bring the VPlan to its final state.
8883   // ---------------------------------------------------------------------------
8884 
8885   // Apply Sink-After legal constraints.
8886   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8887     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8888     if (Region && Region->isReplicator()) {
8889       assert(Region->getNumSuccessors() == 1 &&
8890              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8891       assert(R->getParent()->size() == 1 &&
8892              "A recipe in an original replicator region must be the only "
8893              "recipe in its block");
8894       return Region;
8895     }
8896     return nullptr;
8897   };
8898   for (auto &Entry : SinkAfter) {
8899     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8900     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8901 
8902     auto *TargetRegion = GetReplicateRegion(Target);
8903     auto *SinkRegion = GetReplicateRegion(Sink);
8904     if (!SinkRegion) {
      // If the sink source is not in a replicate region, sink the recipe
      // directly.
8906       if (TargetRegion) {
8907         // The target is in a replication region, make sure to move Sink to
8908         // the block after it, not into the replication region itself.
8909         VPBasicBlock *NextBlock =
8910             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8911         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8912       } else
8913         Sink->moveAfter(Target);
8914       continue;
8915     }
8916 
8917     // The sink source is in a replicate region. Unhook the region from the CFG.
8918     auto *SinkPred = SinkRegion->getSinglePredecessor();
8919     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8920     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8921     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8922     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8923 
8924     if (TargetRegion) {
8925       // The target recipe is also in a replicate region, move the sink region
8926       // after the target region.
8927       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8928       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8929       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8930       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8931     } else {
      // The sink source is in a replicate region but the target is not. Move
      // the whole replicate region, which should only contain a single recipe
      // in its main block.
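      // For example, if the sink recipe lives in replicate region R (already
      // unhooked above) and the target T sits in a plain block [A, T, B], the
      // result is [A, T] -> R -> [B], i.e. R is spliced in right after T.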
8935       auto *SplitBlock =
8936           Target->getParent()->splitAt(std::next(Target->getIterator()));
8937 
8938       auto *SplitPred = SplitBlock->getSinglePredecessor();
8939 
8940       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8941       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8942       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8943     }
8944   }
8945 
8946   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8947   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8948 
8949   // Now that sink-after is done, move induction recipes for optimized truncates
8950   // to the phi section of the header block.
8951   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8952     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8953 
8954   // Adjust the recipes for any inloop reductions.
8955   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
8956                              RecipeBuilder, Range.Start);
8957 
8958   // Introduce a recipe to combine the incoming and previous values of a
8959   // first-order recurrence.
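  // For example, for `%p = phi [ %init, ph ], [ %x, latch ]` this creates a
  // FirstOrderRecurrenceSplice(%p, %x) right after the recipe for %x (or at
  // the start of the following block), and rewires all previous users of %p
  // to the splice.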
8960   for (VPRecipeBase &R :
8961        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8962     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
8963     if (!RecurPhi)
8964       continue;
8965 
8966     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
8967     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
8968     auto *Region = GetReplicateRegion(PrevRecipe);
8969     if (Region)
8970       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
8971     if (Region || PrevRecipe->isPhi())
8972       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
8973     else
8974       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
8975 
8976     auto *RecurSplice = cast<VPInstruction>(
8977         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
8978                              {RecurPhi, RecurPhi->getBackedgeValue()}));
8979 
8980     RecurPhi->replaceAllUsesWith(RecurSplice);
8981     // Set the first operand of RecurSplice to RecurPhi again, after replacing
8982     // all users.
8983     RecurSplice->setOperand(0, RecurPhi);
8984   }
8985 
8986   // Interleave memory: for each Interleave Group we marked earlier as relevant
8987   // for this VPlan, replace the Recipes widening its memory instructions with a
8988   // single VPInterleaveRecipe at its insertion point.
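  // For example, an interleave group of two stores with factor 2 becomes one
  // VPInterleaveRecipe taking the group's address, both stored values and the
  // optional mask; the recipes of the original members are erased.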
8989   for (auto IG : InterleaveGroups) {
8990     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8991         RecipeBuilder.getRecipe(IG->getInsertPos()));
8992     SmallVector<VPValue *, 4> StoredValues;
8993     for (unsigned i = 0; i < IG->getFactor(); ++i)
8994       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8995         auto *StoreR =
8996             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8997         StoredValues.push_back(StoreR->getStoredValue());
8998       }
8999 
9000     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9001                                         Recipe->getMask());
9002     VPIG->insertBefore(Recipe);
9003     unsigned J = 0;
9004     for (unsigned i = 0; i < IG->getFactor(); ++i)
9005       if (Instruction *Member = IG->getMember(i)) {
9006         if (!Member->getType()->isVoidTy()) {
9007           VPValue *OriginalV = Plan->getVPValue(Member);
9008           Plan->removeVPValueFor(Member);
9009           Plan->addVPValue(Member, VPIG->getVPValue(J));
9010           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9011           J++;
9012         }
9013         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9014       }
9015   }
9016 
9017   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9018   // in ways that accessing values using original IR values is incorrect.
9019   Plan->disableValue2VPValue();
9020 
9021   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9022   VPlanTransforms::sinkScalarOperands(*Plan);
9023   VPlanTransforms::mergeReplicateRegions(*Plan);
9024   VPlanTransforms::removeDeadRecipes(*Plan);
9025   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9026 
9027   std::string PlanName;
9028   raw_string_ostream RSO(PlanName);
9029   ElementCount VF = Range.Start;
9030   Plan->addVF(VF);
9031   RSO << "Initial VPlan for VF={" << VF;
9032   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9033     Plan->addVF(VF);
9034     RSO << "," << VF;
9035   }
9036   RSO << "},UF>=1";
9037   RSO.flush();
9038   Plan->setName(PlanName);
9039 
9040   // Fold Exit block into its predecessor if possible.
9041   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9042   // VPBasicBlock as exit.
9043   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
9044 
9045   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9046   return Plan;
9047 }
9048 
9049 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
9052   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9053   // the vectorization pipeline.
9054   assert(!OrigLoop->isInnermost());
9055   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9056 
9057   // Create new empty VPlan
9058   auto Plan = std::make_unique<VPlan>();
9059 
9060   // Build hierarchical CFG
9061   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9062   HCFGBuilder.buildHierarchicalCFG();
9063 
9064   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9065        VF *= 2)
9066     Plan->addVF(VF);
9067 
9068   SmallPtrSet<Instruction *, 1> DeadInstructions;
9069   VPlanTransforms::VPInstructionsToVPRecipes(
9070       OrigLoop, Plan,
9071       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9072       DeadInstructions, *PSE.getSE());
9073 
9074   // Remove the existing terminator of the exiting block of the top-most region.
9075   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9076   auto *Term =
9077       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9078   Term->eraseFromParent();
9079 
9080   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9081                         true);
9082   return Plan;
9083 }
9084 
// Adjust the recipes for reductions. For in-loop reductions, the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
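//
// For example, for an in-loop add reduction with `%sum.next = add %sum.phi,
// %a`, the widened add is replaced by a VPReductionRecipe whose chain operand
// is the VPValue of %sum.phi and whose vector operand is the widened %a,
// optionally guarded by the block-in mask.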
9090 void LoopVectorizationPlanner::adjustRecipesForReductions(
9091     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9092     ElementCount MinVF) {
9093   for (auto &Reduction : CM.getInLoopReductionChains()) {
9094     PHINode *Phi = Reduction.first;
9095     const RecurrenceDescriptor &RdxDesc =
9096         Legal->getReductionVars().find(Phi)->second;
9097     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9098 
9099     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9100       continue;
9101 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
9104     // which of the two operands will remain scalar and which will be reduced.
9105     // For minmax the chain will be the select instructions.
9106     Instruction *Chain = Phi;
9107     for (Instruction *R : ReductionOperations) {
9108       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9109       RecurKind Kind = RdxDesc.getRecurrenceKind();
9110 
9111       VPValue *ChainOp = Plan->getVPValue(Chain);
9112       unsigned FirstOpId;
9113       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9114              "Only min/max recurrences allowed for inloop reductions");
9115       // Recognize a call to the llvm.fmuladd intrinsic.
9116       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9117       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9118              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9119       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9120         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9121                "Expected to replace a VPWidenSelectSC");
9122         FirstOpId = 1;
9123       } else {
9124         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9125                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9126                "Expected to replace a VPWidenSC");
9127         FirstOpId = 0;
9128       }
9129       unsigned VecOpId =
9130           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9131       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9132 
9133       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9134                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9135                          : nullptr;
9136 
9137       if (IsFMulAdd) {
9138         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9139         // need to create an fmul recipe to use as the vector operand for the
9140         // fadd reduction.
9141         VPInstruction *FMulRecipe = new VPInstruction(
9142             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9143         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9144         WidenRecipe->getParent()->insert(FMulRecipe,
9145                                          WidenRecipe->getIterator());
9146         VecOp = FMulRecipe;
9147       }
9148       VPReductionRecipe *RedRecipe =
9149           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9150       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9151       Plan->removeVPValueFor(R);
9152       Plan->addVPValue(R, RedRecipe);
9153       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9154       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9155       WidenRecipe->eraseFromParent();
9156 
9157       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9158         VPRecipeBase *CompareRecipe =
9159             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9160         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9161                "Expected to replace a VPWidenSC");
9162         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9163                "Expected no remaining users");
9164         CompareRecipe->eraseFromParent();
9165       }
9166       Chain = R;
9167     }
9168   }
9169 
9170   // If tail is folded by masking, introduce selects between the phi
9171   // and the live-out instruction of each reduction, at the beginning of the
9172   // dedicated latch block.
9173   if (CM.foldTailByMasking()) {
9174     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9175     for (VPRecipeBase &R :
9176          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9177       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9178       if (!PhiR || PhiR->isInLoop())
9179         continue;
9180       VPValue *Cond =
9181           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9182       VPValue *Red = PhiR->getBackedgeValue();
9183       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9184              "reduction recipe must be defined before latch");
9185       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9186     }
9187   }
9188 }
9189 
9190 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9191 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9192                                VPSlotTracker &SlotTracker) const {
9193   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9194   IG->getInsertPos()->printAsOperand(O, false);
9195   O << ", ";
9196   getAddr()->printAsOperand(O, SlotTracker);
9197   VPValue *Mask = getMask();
9198   if (Mask) {
9199     O << ", ";
9200     Mask->printAsOperand(O, SlotTracker);
9201   }
9202 
9203   unsigned OpIdx = 0;
9204   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9205     if (!IG->getMember(i))
9206       continue;
9207     if (getNumStoreOperands() > 0) {
9208       O << "\n" << Indent << "  store ";
9209       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9210       O << " to index " << i;
9211     } else {
9212       O << "\n" << Indent << "  ";
9213       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9214       O << " = load from index " << i;
9215     }
9216     ++OpIdx;
9217   }
9218 }
9219 #endif
9220 
9221 void VPWidenCallRecipe::execute(VPTransformState &State) {
9222   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9223                                   *this, State);
9224 }
9225 
9226 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9227   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9228   State.ILV->setDebugLocFromInst(&I);
9229 
  // The condition can be loop invariant but still defined inside the
9231   // loop. This means that we can't just use the original 'cond' value.
9232   // We have to take the 'vectorized' value and pick the first lane.
9233   // Instcombine will make this a no-op.
9234   auto *InvarCond =
9235       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9236 
9237   for (unsigned Part = 0; Part < State.UF; ++Part) {
9238     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9239     Value *Op0 = State.get(getOperand(1), Part);
9240     Value *Op1 = State.get(getOperand(2), Part);
9241     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9242     State.set(this, Sel, Part);
9243     State.ILV->addMetadata(Sel, &I);
9244   }
9245 }
9246 
9247 void VPWidenRecipe::execute(VPTransformState &State) {
9248   auto &I = *cast<Instruction>(getUnderlyingValue());
9249   auto &Builder = State.Builder;
9250   switch (I.getOpcode()) {
9251   case Instruction::Call:
9252   case Instruction::Br:
9253   case Instruction::PHI:
9254   case Instruction::GetElementPtr:
9255   case Instruction::Select:
9256     llvm_unreachable("This instruction is handled by a different recipe.");
9257   case Instruction::UDiv:
9258   case Instruction::SDiv:
9259   case Instruction::SRem:
9260   case Instruction::URem:
9261   case Instruction::Add:
9262   case Instruction::FAdd:
9263   case Instruction::Sub:
9264   case Instruction::FSub:
9265   case Instruction::FNeg:
9266   case Instruction::Mul:
9267   case Instruction::FMul:
9268   case Instruction::FDiv:
9269   case Instruction::FRem:
9270   case Instruction::Shl:
9271   case Instruction::LShr:
9272   case Instruction::AShr:
9273   case Instruction::And:
9274   case Instruction::Or:
9275   case Instruction::Xor: {
9276     // Just widen unops and binops.
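    // For example, `%r = add nsw i32 %x, %y` with VF = 4 becomes, per part,
    // `add nsw <4 x i32> %x.vec, %y.vec` (with nsw dropped if the block
    // needed predication, see below).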
9277     State.ILV->setDebugLocFromInst(&I);
9278 
9279     for (unsigned Part = 0; Part < State.UF; ++Part) {
9280       SmallVector<Value *, 2> Ops;
9281       for (VPValue *VPOp : operands())
9282         Ops.push_back(State.get(VPOp, Part));
9283 
9284       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9285 
9286       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9287         VecOp->copyIRFlags(&I);
9288 
9289         // If the instruction is vectorized and was in a basic block that needed
9290         // predication, we can't propagate poison-generating flags (nuw/nsw,
9291         // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties no longer hold.
9294         if (State.MayGeneratePoisonRecipes.contains(this))
9295           VecOp->dropPoisonGeneratingFlags();
9296       }
9297 
9298       // Use this vector value for all users of the original instruction.
9299       State.set(this, V, Part);
9300       State.ILV->addMetadata(V, &I);
9301     }
9302 
9303     break;
9304   }
9305   case Instruction::Freeze: {
9306     State.ILV->setDebugLocFromInst(&I);
9307 
9308     for (unsigned Part = 0; Part < State.UF; ++Part) {
9309       Value *Op = State.get(getOperand(0), Part);
9310 
9311       Value *Freeze = Builder.CreateFreeze(Op);
9312       State.set(this, Freeze, Part);
9313     }
9314     break;
9315   }
9316   case Instruction::ICmp:
9317   case Instruction::FCmp: {
9318     // Widen compares. Generate vector compares.
9319     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9320     auto *Cmp = cast<CmpInst>(&I);
9321     State.ILV->setDebugLocFromInst(Cmp);
9322     for (unsigned Part = 0; Part < State.UF; ++Part) {
9323       Value *A = State.get(getOperand(0), Part);
9324       Value *B = State.get(getOperand(1), Part);
9325       Value *C = nullptr;
9326       if (FCmp) {
9327         // Propagate fast math flags.
9328         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9329         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9330         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9331       } else {
9332         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9333       }
9334       State.set(this, C, Part);
9335       State.ILV->addMetadata(C, &I);
9336     }
9337 
9338     break;
9339   }
9340 
9341   case Instruction::ZExt:
9342   case Instruction::SExt:
9343   case Instruction::FPToUI:
9344   case Instruction::FPToSI:
9345   case Instruction::FPExt:
9346   case Instruction::PtrToInt:
9347   case Instruction::IntToPtr:
9348   case Instruction::SIToFP:
9349   case Instruction::UIToFP:
9350   case Instruction::Trunc:
9351   case Instruction::FPTrunc:
9352   case Instruction::BitCast: {
9353     auto *CI = cast<CastInst>(&I);
9354     State.ILV->setDebugLocFromInst(CI);
9355 
    // Vectorize casts.
9357     Type *DestTy = (State.VF.isScalar())
9358                        ? CI->getType()
9359                        : VectorType::get(CI->getType(), State.VF);
9360 
9361     for (unsigned Part = 0; Part < State.UF; ++Part) {
9362       Value *A = State.get(getOperand(0), Part);
9363       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9364       State.set(this, Cast, Part);
9365       State.ILV->addMetadata(Cast, &I);
9366     }
9367     break;
9368   }
9369   default:
9370     // This instruction is not vectorized by simple widening.
9371     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9372     llvm_unreachable("Unhandled instruction!");
9373   } // end of switch.
9374 }
9375 
9376 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9377   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9378   // Construct a vector GEP by widening the operands of the scalar GEP as
9379   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9380   // results in a vector of pointers when at least one operand of the GEP
9381   // is vector-typed. Thus, to keep the representation compact, we only use
9382   // vector-typed operands for loop-varying values.
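  //
  // For example, for `getelementptr %base, %i` with a loop-invariant %base
  // and a loop-varying %i, only %i is widened to a vector; the result is
  // still a vector of pointers.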
9383 
9384   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9385     // If we are vectorizing, but the GEP has only loop-invariant operands,
9386     // the GEP we build (by only using vector-typed operands for
9387     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9388     // produce a vector of pointers, we need to either arbitrarily pick an
9389     // operand to broadcast, or broadcast a clone of the original GEP.
9390     // Here, we broadcast a clone of the original.
9391     //
9392     // TODO: If at some point we decide to scalarize instructions having
9393     //       loop-invariant operands, this special case will no longer be
9394     //       required. We would add the scalarization decision to
9395     //       collectLoopScalars() and teach getVectorValue() to broadcast
9396     //       the lane-zero scalar value.
9397     auto *Clone = State.Builder.Insert(GEP->clone());
9398     for (unsigned Part = 0; Part < State.UF; ++Part) {
9399       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9400       State.set(this, EntryPart, Part);
9401       State.ILV->addMetadata(EntryPart, GEP);
9402     }
9403   } else {
9404     // If the GEP has at least one loop-varying operand, we are sure to
9405     // produce a vector of pointers. But if we are only unrolling, we want
9406     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9407     // produce with the code below will be scalar (if VF == 1) or vector
9408     // (otherwise). Note that for the unroll-only case, we still maintain
9409     // values in the vector mapping with initVector, as we do for other
9410     // instructions.
9411     for (unsigned Part = 0; Part < State.UF; ++Part) {
9412       // The pointer operand of the new GEP. If it's loop-invariant, we
9413       // won't broadcast it.
9414       auto *Ptr = IsPtrLoopInvariant
9415                       ? State.get(getOperand(0), VPIteration(0, 0))
9416                       : State.get(getOperand(0), Part);
9417 
9418       // Collect all the indices for the new GEP. If any index is
9419       // loop-invariant, we won't broadcast it.
9420       SmallVector<Value *, 4> Indices;
9421       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9422         VPValue *Operand = getOperand(I);
9423         if (IsIndexLoopInvariant[I - 1])
9424           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9425         else
9426           Indices.push_back(State.get(Operand, Part));
9427       }
9428 
9429       // If the GEP instruction is vectorized and was in a basic block that
9430       // needed predication, we can't propagate the poison-generating 'inbounds'
9431       // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' property no
      // longer hold.
9434       bool IsInBounds =
9435           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9436 
9437       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9438       // but it should be a vector, otherwise.
9439       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
9440                                              Indices, "", IsInBounds);
9441       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9442              "NewGEP is not a pointer vector");
9443       State.set(this, NewGEP, Part);
9444       State.ILV->addMetadata(NewGEP, GEP);
9445     }
9446   }
9447 }
9448 
9449 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9450   assert(!State.Instance && "Int or FP induction being replicated.");
9451 
9452   Value *Start = getStartValue()->getLiveInIRValue();
9453   const InductionDescriptor &ID = getInductionDescriptor();
9454   TruncInst *Trunc = getTruncInst();
9455   IRBuilderBase &Builder = State.Builder;
9456   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9457   assert(State.VF.isVector() && "must have vector VF");
9458 
9459   // The value from the original loop to which we are mapping the new induction
9460   // variable.
9461   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9462 
9463   // Fast-math-flags propagate from the original induction instruction.
9464   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9465   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9466     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9467 
9468   // Now do the actual transformations, and start with fetching the step value.
9469   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9470 
9471   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9472          "Expected either an induction phi-node or a truncate of it!");
9473 
9474   // Construct the initial value of the vector IV in the vector loop preheader
9475   auto CurrIP = Builder.saveIP();
9476   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9477   Builder.SetInsertPoint(VectorPH->getTerminator());
9478   if (isa<TruncInst>(EntryVal)) {
9479     assert(Start->getType()->isIntegerTy() &&
9480            "Truncation requires an integer type");
9481     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9482     Step = Builder.CreateTrunc(Step, TruncType);
9483     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9484   }
9485 
9486   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9487   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9488   Value *SteppedStart = getStepVector(
9489       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9490 
9491   // We create vector phi nodes for both integer and floating-point induction
9492   // variables. Here, we determine the kind of arithmetic we will perform.
9493   Instruction::BinaryOps AddOp;
9494   Instruction::BinaryOps MulOp;
9495   if (Step->getType()->isIntegerTy()) {
9496     AddOp = Instruction::Add;
9497     MulOp = Instruction::Mul;
9498   } else {
9499     AddOp = ID.getInductionOpcode();
9500     MulOp = Instruction::FMul;
9501   }
9502 
9503   // Multiply the vectorization factor by the step using integer or
9504   // floating-point arithmetic as appropriate.
9505   Type *StepType = Step->getType();
9506   Value *RuntimeVF;
9507   if (Step->getType()->isFloatingPointTy())
9508     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9509   else
9510     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9511   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9512 
9513   // Create a vector splat to use in the induction update.
9514   //
9515   // FIXME: If the step is non-constant, we create the vector splat with
9516   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9517   //        handle a constant vector splat.
9518   Value *SplatVF = isa<Constant>(Mul)
9519                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9520                        : Builder.CreateVectorSplat(State.VF, Mul);
9521   Builder.restoreIP(CurrIP);
9522 
9523   // We may need to add the step a number of times, depending on the unroll
9524   // factor. The last of those goes into the PHI.
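  // For example, with a fixed VF = 4, UF = 2, Start = 0 and Step = 1:
  // SteppedStart = <0,1,2,3>, SplatVF = <4,4,4,4>; part 0 uses the phi value,
  // part 1 uses phi + <4,4,4,4>, and vec.ind.next advances the phi by 8 per
  // lane.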
9525   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9526                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9527   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9528   Instruction *LastInduction = VecInd;
9529   for (unsigned Part = 0; Part < State.UF; ++Part) {
9530     State.set(this, LastInduction, Part);
9531 
9532     if (isa<TruncInst>(EntryVal))
9533       State.ILV->addMetadata(LastInduction, EntryVal);
9534 
9535     LastInduction = cast<Instruction>(
9536         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9537     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9538   }
9539 
9540   LastInduction->setName("vec.ind.next");
9541   VecInd->addIncoming(SteppedStart, VectorPH);
9542   // Add induction update using an incorrect block temporarily. The phi node
9543   // will be fixed after VPlan execution. Note that at this point the latch
9544   // block cannot be used, as it does not exist yet.
9545   // TODO: Model increment value in VPlan, by turning the recipe into a
9546   // multi-def and a subclass of VPHeaderPHIRecipe.
9547   VecInd->addIncoming(LastInduction, VectorPH);
9548 }
9549 
9550 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9551   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9552          "Not a pointer induction according to InductionDescriptor!");
9553   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9554          "Unexpected type.");
9555 
9556   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9557   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9558 
9559   if (onlyScalarsGenerated(State.VF)) {
9560     // This is the normalized GEP that starts counting at zero.
9561     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9562         CanonicalIV, IndDesc.getStep()->getType());
9563     // Determine the number of scalars we need to generate for each unroll
9564     // iteration. If the instruction is uniform, we only need to generate the
9565     // first lane. Otherwise, we generate all VF values.
9566     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9567     assert((IsUniform || !State.VF.isScalable()) &&
9568            "Cannot scalarize a scalable VF");
9569     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9570 
9571     for (unsigned Part = 0; Part < State.UF; ++Part) {
9572       Value *PartStart =
9573           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9574 
9575       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9576         Value *Idx = State.Builder.CreateAdd(
9577             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9578         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9579 
9580         Value *Step = CreateStepValue(IndDesc.getStep(), SE,
9581                                       State.CFG.PrevBB->getTerminator());
9582         Value *SclrGep = emitTransformedIndex(
9583             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9584         SclrGep->setName("next.gep");
9585         State.set(this, SclrGep, VPIteration(Part, Lane));
9586       }
9587     }
9588     return;
9589   }
9590 
9591   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9592          "Induction step not a SCEV constant!");
9593   Type *PhiType = IndDesc.getStep()->getType();
9594 
9595   // Build a pointer phi
9596   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9597   Type *ScStValueType = ScalarStartValue->getType();
9598   PHINode *NewPointerPhi =
9599       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9600 
9601   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9602   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9603 
9604   // A pointer induction, performed by using a gep
9605   const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
9606   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9607 
9608   const SCEV *ScalarStep = IndDesc.getStep();
9609   SCEVExpander Exp(SE, DL, "induction");
9610   Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
9611   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9612   Value *NumUnrolledElems =
9613       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9614   Value *InductionGEP = GetElementPtrInst::Create(
9615       IndDesc.getElementType(), NewPointerPhi,
9616       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9617       InductionLoc);
9618   // Add induction update using an incorrect block temporarily. The phi node
9619   // will be fixed after VPlan execution. Note that at this point the latch
9620   // block cannot be used, as it does not exist yet.
9621   // TODO: Model increment value in VPlan, by turning the recipe into a
9622   // multi-def and a subclass of VPHeaderPHIRecipe.
9623   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9624 
9625   // Create UF many actual address geps that use the pointer
9626   // phi as base and a vectorized version of the step value
9627   // (<step*0, ..., step*N>) as offset.
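  // Illustrative sketch (assuming a fixed VF of 4, so RuntimeVF = 4): each
  // part Part produces a gep off %pointer.phi with offsets
  //   <(4*Part+0)*step, ..., (4*Part+3)*step>
  // while the phi itself advances once per vector iteration by
  //   %ptr.ind = gep %pointer.phi, step * RuntimeVF * UF
  // as set up above.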
9628   for (unsigned Part = 0; Part < State.UF; ++Part) {
9629     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9630     Value *StartOffsetScalar =
9631         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9632     Value *StartOffset =
9633         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9634     // Create a vector of consecutive numbers from zero to VF.
9635     StartOffset = State.Builder.CreateAdd(
9636         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9637 
9638     Value *GEP = State.Builder.CreateGEP(
9639         IndDesc.getElementType(), NewPointerPhi,
9640         State.Builder.CreateMul(
9641             StartOffset,
9642             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9643             "vector.gep"));
9644     State.set(this, GEP, Part);
9645   }
9646 }
9647 
9648 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9649   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9650 
9651   // Fast-math-flags propagate from the original induction instruction.
9652   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9653   if (IndDesc.getInductionBinOp() &&
9654       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9655     State.Builder.setFastMathFlags(
9656         IndDesc.getInductionBinOp()->getFastMathFlags());
9657 
9658   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9659   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9660     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9661     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9662     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9663       ScalarIV =
9664           Ty->isIntegerTy()
9665               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9666               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9667       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9668                                       getStartValue()->getLiveInIRValue(), Step,
9669                                       IndDesc);
9670       ScalarIV->setName("offset.idx");
9671     }
9672     if (TruncToTy) {
9673       assert(Step->getType()->isIntegerTy() &&
9674              "Truncation requires an integer step");
9675       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9676       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9677     }
9678     return ScalarIV;
9679   };
9680 
9681   Value *ScalarIV = CreateScalarIV(Step);
9682   if (State.VF.isVector()) {
9683     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9684     return;
9685   }
9686 
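  // For the scalar (VF = 1) case handled below, each unrolled part gets
  // ScalarIV advanced by Part * Step; e.g. (illustrative) with UF = 4 and an
  // integer step the parts are ScalarIV + 0*Step, ..., ScalarIV + 3*Step. The
  // floating-point case uses the induction opcode instead of the add.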
9687   for (unsigned Part = 0; Part < State.UF; ++Part) {
9688     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9689     Value *EntryPart;
9690     if (Step->getType()->isFloatingPointTy()) {
9691       Value *StartIdx =
9692           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9693       // Floating-point operations inherit FMF via the builder's flags.
9694       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9695       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9696                                             ScalarIV, MulOp);
9697     } else {
9698       Value *StartIdx =
9699           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9700       EntryPart = State.Builder.CreateAdd(
9701           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9702     }
9703     State.set(this, EntryPart, Part);
9704   }
9705 }
9706 
9707 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9708   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9709                                  State);
9710 }
9711 
9712 void VPBlendRecipe::execute(VPTransformState &State) {
9713   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9714   // We know that all PHIs in non-header blocks are converted into
9715   // selects, so we don't have to worry about the insertion order and we
9716   // can just use the builder.
9717   // At this point we generate the predication tree. There may be
9718   // duplications since this is a simple recursive scan, but future
9719   // optimizations will clean it up.
9720 
9721   unsigned NumIncoming = getNumIncomingValues();
9722 
9723   // Generate a sequence of selects of the form:
9724   // SELECT(Mask3, In3,
9725   //        SELECT(Mask2, In2,
9726   //               SELECT(Mask1, In1,
9727   //                      In0)))
9728   // Note that Mask0 is never used: lanes for which no path reaches this phi,
9729   // and which are essentially undef, take their value from In0.
9730   InnerLoopVectorizer::VectorParts Entry(State.UF);
9731   for (unsigned In = 0; In < NumIncoming; ++In) {
9732     for (unsigned Part = 0; Part < State.UF; ++Part) {
9733       // We might have single edge PHIs (blocks) - use an identity
9734       // 'select' for the first PHI operand.
9735       Value *In0 = State.get(getIncomingValue(In), Part);
9736       if (In == 0)
9737         Entry[Part] = In0; // Initialize with the first incoming value.
9738       else {
9739         // Select between the current value and the previous incoming edge
9740         // based on the incoming mask.
9741         Value *Cond = State.get(getMask(In), Part);
9742         Entry[Part] =
9743             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9744       }
9745     }
9746   }
9747   for (unsigned Part = 0; Part < State.UF; ++Part)
9748     State.set(this, Entry[Part], Part);
9749 }
9750 
9751 void VPInterleaveRecipe::execute(VPTransformState &State) {
9752   assert(!State.Instance && "Interleave group being replicated.");
9753   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9754                                       getStoredValues(), getMask());
9755 }
9756 
9757 void VPReductionRecipe::execute(VPTransformState &State) {
9758   assert(!State.Instance && "Reduction being replicated.");
9759   Value *PrevInChain = State.get(getChainOp(), 0);
9760   RecurKind Kind = RdxDesc->getRecurrenceKind();
9761   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9762   // Propagate the fast-math flags carried by the underlying instruction.
9763   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9764   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9765   for (unsigned Part = 0; Part < State.UF; ++Part) {
9766     Value *NewVecOp = State.get(getVecOp(), Part);
9767     if (VPValue *Cond = getCondOp()) {
9768       Value *NewCond = State.get(Cond, Part);
9769       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9770       Value *Iden = RdxDesc->getRecurrenceIdentity(
9771           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9772       Value *IdenVec =
9773           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9774       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9775       NewVecOp = Select;
9776     }
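    // For example (illustrative): for an integer add reduction with mask
    // <1,0,1,0>, the masked-off lanes select the identity 0, so they do not
    // affect the reduced value computed below.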
9777     Value *NewRed;
9778     Value *NextInChain;
9779     if (IsOrdered) {
9780       if (State.VF.isVector())
9781         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9782                                         PrevInChain);
9783       else
9784         NewRed = State.Builder.CreateBinOp(
9785             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9786             NewVecOp);
9787       PrevInChain = NewRed;
9788     } else {
9789       PrevInChain = State.get(getChainOp(), Part);
9790       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9791     }
9792     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9793       NextInChain =
9794           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9795                          NewRed, PrevInChain);
9796     } else if (IsOrdered)
9797       NextInChain = NewRed;
9798     else
9799       NextInChain = State.Builder.CreateBinOp(
9800           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9801           PrevInChain);
9802     State.set(this, NextInChain, Part);
9803   }
9804 }
9805 
9806 void VPReplicateRecipe::execute(VPTransformState &State) {
9807   if (State.Instance) { // Generate a single instance.
9808     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9809     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9810                                     IsPredicated, State);
9811     // Insert the scalar instance, packing it into a vector.
9812     if (AlsoPack && State.VF.isVector()) {
9813       // If we're constructing lane 0, initialize to start from poison.
9814       if (State.Instance->Lane.isFirstLane()) {
9815         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9816         Value *Poison = PoisonValue::get(
9817             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9818         State.set(this, Poison, State.Instance->Part);
9819       }
9820       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9821     }
9822     return;
9823   }
9824 
9825   // Generate scalar instances for all VF lanes of all UF parts, unless the
9826   // instruction is uniform, in which case generate only the first lane for
9827   // each of the UF parts.
9828   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9829   assert((!State.VF.isScalable() || IsUniform) &&
9830          "Can't scalarize a scalable vector");
9831   for (unsigned Part = 0; Part < State.UF; ++Part)
9832     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9833       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9834                                       VPIteration(Part, Lane), IsPredicated,
9835                                       State);
9836 }
9837 
9838 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9839   assert(State.Instance && "Branch on Mask works only on single instance.");
9840 
9841   unsigned Part = State.Instance->Part;
9842   unsigned Lane = State.Instance->Lane.getKnownLane();
9843 
9844   Value *ConditionBit = nullptr;
9845   VPValue *BlockInMask = getMask();
9846   if (BlockInMask) {
9847     ConditionBit = State.get(BlockInMask, Part);
9848     if (ConditionBit->getType()->isVectorTy())
9849       ConditionBit = State.Builder.CreateExtractElement(
9850           ConditionBit, State.Builder.getInt32(Lane));
9851   } else // Block in mask is all-one.
9852     ConditionBit = State.Builder.getTrue();
9853 
9854   // Replace the temporary unreachable terminator with a new conditional branch,
9855   // whose two destinations will be set later when they are created.
9856   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9857   assert(isa<UnreachableInst>(CurrentTerminator) &&
9858          "Expected to replace unreachable terminator with conditional branch.");
9859   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9860   CondBr->setSuccessor(0, nullptr);
9861   ReplaceInstWithInst(CurrentTerminator, CondBr);
9862 }
9863 
9864 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9865   assert(State.Instance && "Predicated instruction PHI works per instance.");
9866   Instruction *ScalarPredInst =
9867       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9868   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9869   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9870   assert(PredicatingBB && "Predicated block has no single predecessor.");
9871   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9872          "operand must be VPReplicateRecipe");
9873 
9874   // By current pack/unpack logic we need to generate only a single phi node: if
9875   // a vector value for the predicated instruction exists at this point it means
9876   // the instruction has vector users only, and a phi for the vector value is
9877   // needed. In this case the recipe of the predicated instruction is marked to
9878   // also do that packing, thereby "hoisting" the insert-element sequence.
9879   // Otherwise, a phi node for the scalar value is needed.
9880   unsigned Part = State.Instance->Part;
9881   if (State.hasVectorValue(getOperand(0), Part)) {
9882     Value *VectorValue = State.get(getOperand(0), Part);
9883     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9884     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9885     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9886     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9887     if (State.hasVectorValue(this, Part))
9888       State.reset(this, VPhi, Part);
9889     else
9890       State.set(this, VPhi, Part);
9891     // NOTE: Currently we need to update the value of the operand, so the next
9892     // predicated iteration inserts its generated value in the correct vector.
9893     State.reset(getOperand(0), VPhi, Part);
9894   } else {
9895     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9896     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9897     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9898                      PredicatingBB);
9899     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9900     if (State.hasScalarValue(this, *State.Instance))
9901       State.reset(this, Phi, *State.Instance);
9902     else
9903       State.set(this, Phi, *State.Instance);
9904     // NOTE: Currently we need to update the value of the operand, so the next
9905     // predicated iteration inserts its generated value in the correct vector.
9906     State.reset(getOperand(0), Phi, *State.Instance);
9907   }
9908 }
9909 
9910 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9911   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9912 
9913   // Attempt to issue a wide load.
9914   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9915   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9916 
9917   assert((LI || SI) && "Invalid Load/Store instruction");
9918   assert((!SI || StoredValue) && "No stored value provided for widened store");
9919   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9920 
9921   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9922 
9923   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9924   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9925   bool CreateGatherScatter = !Consecutive;
9926 
9927   auto &Builder = State.Builder;
9928   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9929   bool isMaskRequired = getMask();
9930   if (isMaskRequired)
9931     for (unsigned Part = 0; Part < State.UF; ++Part)
9932       BlockInMaskParts[Part] = State.get(getMask(), Part);
9933 
9934   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9935     // Calculate the pointer for the specific unroll-part.
9936     GetElementPtrInst *PartPtr = nullptr;
9937 
9938     bool InBounds = false;
9939     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9940       InBounds = gep->isInBounds();
9941     if (Reverse) {
9942       // If the address is consecutive but reversed, then the
9943       // wide store needs to start at the last vector element.
9944       // RunTimeVF = VScale * VF.getKnownMinValue()
9945       // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9946       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9947       // NumElt = -Part * RunTimeVF
9948       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9949       // LastLane = 1 - RunTimeVF
9950       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9951       PartPtr =
9952           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9953       PartPtr->setIsInBounds(InBounds);
9954       PartPtr = cast<GetElementPtrInst>(
9955           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9956       PartPtr->setIsInBounds(InBounds);
9957       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9958         BlockInMaskParts[Part] =
9959             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9960     } else {
9961       Value *Increment =
9962           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9963       PartPtr = cast<GetElementPtrInst>(
9964           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9965       PartPtr->setIsInBounds(InBounds);
9966     }
9967 
9968     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9969     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9970   };
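  // Worked example for the reverse case above (illustrative, fixed VF of 4,
  // so RunTimeVF = 4): Part 1 yields NumElt = -4 and LastLane = -3, i.e. the
  // part pointer is Ptr - 7 and the wide access covers elements
  // Ptr[-7] .. Ptr[-4]; the loaded or stored vector is then reversed
  // separately.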
9971 
9972   // Handle Stores:
9973   if (SI) {
9974     State.ILV->setDebugLocFromInst(SI);
9975 
9976     for (unsigned Part = 0; Part < State.UF; ++Part) {
9977       Instruction *NewSI = nullptr;
9978       Value *StoredVal = State.get(StoredValue, Part);
9979       if (CreateGatherScatter) {
9980         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9981         Value *VectorGep = State.get(getAddr(), Part);
9982         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9983                                             MaskPart);
9984       } else {
9985         if (Reverse) {
9986           // If we store to reverse consecutive memory locations, then we need
9987           // to reverse the order of elements in the stored value.
9988           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9989           // We don't want to update the value in the map as it might be used in
9990           // another expression. So don't call resetVectorValue(StoredVal).
9991         }
9992         auto *VecPtr =
9993             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9994         if (isMaskRequired)
9995           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9996                                             BlockInMaskParts[Part]);
9997         else
9998           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9999       }
10000       State.ILV->addMetadata(NewSI, SI);
10001     }
10002     return;
10003   }
10004 
10005   // Handle loads.
10006   assert(LI && "Must have a load instruction");
10007   State.ILV->setDebugLocFromInst(LI);
10008   for (unsigned Part = 0; Part < State.UF; ++Part) {
10009     Value *NewLI;
10010     if (CreateGatherScatter) {
10011       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10012       Value *VectorGep = State.get(getAddr(), Part);
10013       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10014                                          nullptr, "wide.masked.gather");
10015       State.ILV->addMetadata(NewLI, LI);
10016     } else {
10017       auto *VecPtr =
10018           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10019       if (isMaskRequired)
10020         NewLI = Builder.CreateMaskedLoad(
10021             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10022             PoisonValue::get(DataTy), "wide.masked.load");
10023       else
10024         NewLI =
10025             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10026 
10027       // Add metadata to the load, but setVectorValue to the reverse shuffle.
10028       State.ILV->addMetadata(NewLI, LI);
10029       if (Reverse)
10030         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10031     }
10032 
10033     State.set(getVPSingleValue(), NewLI, Part);
10034   }
10035 }
10036 
10037 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10038 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10039 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10040 // for predication.
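// For example (illustrative): a function carrying the optsize attribute takes
// branch 1) below and returns CM_ScalarEpilogueNotAllowedOptSize, while an
// explicit PreferPredicateTy::PredicateOrDontVectorize request takes branch 2)
// and returns CM_ScalarEpilogueNotAllowedUsePredicate.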
10041 static ScalarEpilogueLowering getScalarEpilogueLowering(
10042     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10043     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10044     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10045     LoopVectorizationLegality &LVL) {
10046   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10047   // don't look at hints or options, and don't request a scalar epilogue.
10048   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10049   // LoopAccessInfo (due to code dependency and not being able to reliably get
10050   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10051   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10052   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10053   // back to the old way and vectorize with versioning when forced. See D81345.)
10054   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10055                                                       PGSOQueryType::IRPass) &&
10056                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10057     return CM_ScalarEpilogueNotAllowedOptSize;
10058 
10059   // 2) If set, obey the directives
10060   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10061     switch (PreferPredicateOverEpilogue) {
10062     case PreferPredicateTy::ScalarEpilogue:
10063       return CM_ScalarEpilogueAllowed;
10064     case PreferPredicateTy::PredicateElseScalarEpilogue:
10065       return CM_ScalarEpilogueNotNeededUsePredicate;
10066     case PreferPredicateTy::PredicateOrDontVectorize:
10067       return CM_ScalarEpilogueNotAllowedUsePredicate;
10068     };
10069   }
10070 
10071   // 3) If set, obey the hints
10072   switch (Hints.getPredicate()) {
10073   case LoopVectorizeHints::FK_Enabled:
10074     return CM_ScalarEpilogueNotNeededUsePredicate;
10075   case LoopVectorizeHints::FK_Disabled:
10076     return CM_ScalarEpilogueAllowed;
10077   };
10078 
10079   // 4) if the TTI hook indicates this is profitable, request predication.
10080   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10081                                        LVL.getLAI()))
10082     return CM_ScalarEpilogueNotNeededUsePredicate;
10083 
10084   return CM_ScalarEpilogueAllowed;
10085 }
10086 
10087 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10088   // If Values have been set for this Def return the one relevant for \p Part.
10089   if (hasVectorValue(Def, Part))
10090     return Data.PerPartOutput[Def][Part];
10091 
10092   if (!hasScalarValue(Def, {Part, 0})) {
10093     Value *IRV = Def->getLiveInIRValue();
10094     Value *B = ILV->getBroadcastInstrs(IRV);
10095     set(Def, B, Part);
10096     return B;
10097   }
10098 
10099   Value *ScalarValue = get(Def, {Part, 0});
10100   // If we aren't vectorizing, we can just copy the scalar map values over
10101   // to the vector map.
10102   if (VF.isScalar()) {
10103     set(Def, ScalarValue, Part);
10104     return ScalarValue;
10105   }
10106 
10107   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10108   bool IsUniform = RepR && RepR->isUniform();
10109 
10110   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10111   // Check if there is a scalar value for the selected lane.
10112   if (!hasScalarValue(Def, {Part, LastLane})) {
10113     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10114     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10115             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10116            "unexpected recipe found to be invariant");
10117     IsUniform = true;
10118     LastLane = 0;
10119   }
10120 
10121   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10122   // Set the insert point after the last scalarized instruction or after the
10123   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10124   // will directly follow the scalar definitions.
10125   auto OldIP = Builder.saveIP();
10126   auto NewIP =
10127       isa<PHINode>(LastInst)
10128           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10129           : std::next(BasicBlock::iterator(LastInst));
10130   Builder.SetInsertPoint(&*NewIP);
10131 
10132   // However, if we are vectorizing, we need to construct the vector values.
10133   // If the value is known to be uniform after vectorization, we can just
10134   // broadcast the scalar value corresponding to lane zero for each unroll
10135   // iteration. Otherwise, we construct the vector values using
10136   // insertelement instructions. Since the resulting vectors are stored in
10137   // State, we will only generate the insertelements once.
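  // Illustrative example (fixed VF of 4, non-uniform Def): the four scalar
  // values for lanes (Part, 0) .. (Part, 3) are packed with insertelement into
  // a <4 x Ty> vector; a uniform Def instead just broadcasts the lane-0 value.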
10138   Value *VectorValue = nullptr;
10139   if (IsUniform) {
10140     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10141     set(Def, VectorValue, Part);
10142   } else {
10143     // Initialize packing with insertelements to start from poison.
10144     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10145     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10146     set(Def, Undef, Part);
10147     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10148       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10149     VectorValue = get(Def, Part);
10150   }
10151   Builder.restoreIP(OldIP);
10152   return VectorValue;
10153 }
10154 
10155 // Process the loop in the VPlan-native vectorization path. This path builds
10156 // VPlan upfront in the vectorization pipeline, which allows applying
10157 // VPlan-to-VPlan transformations from the very beginning without modifying the
10158 // input LLVM IR.
10159 static bool processLoopInVPlanNativePath(
10160     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10161     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10162     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10163     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10164     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10165     LoopVectorizationRequirements &Requirements) {
10166 
10167   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10168     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10169     return false;
10170   }
10171   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10172   Function *F = L->getHeader()->getParent();
10173   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10174 
10175   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10176       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10177 
10178   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10179                                 &Hints, IAI);
10180   // Use the planner for outer loop vectorization.
10181   // TODO: CM is not used at this point inside the planner. Turn CM into an
10182   // optional argument if we don't need it in the future.
10183   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10184                                Requirements, ORE);
10185 
10186   // Get user vectorization factor.
10187   ElementCount UserVF = Hints.getWidth();
10188 
10189   CM.collectElementTypesForWidening();
10190 
10191   // Plan how to best vectorize, return the best VF and its cost.
10192   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10193 
10194   // If we are stress testing VPlan builds, do not attempt to generate vector
10195   // code. Masked vector code generation support will follow soon.
10196   // Also, do not attempt to vectorize if no vector code will be produced.
10197   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10198     return false;
10199 
10200   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10201 
10202   {
10203     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10204                              F->getParent()->getDataLayout());
10205     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10206                            &CM, BFI, PSI, Checks);
10207     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10208                       << L->getHeader()->getParent()->getName() << "\"\n");
10209     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10210   }
10211 
10212   // Mark the loop as already vectorized to avoid vectorizing again.
10213   Hints.setAlreadyVectorized();
10214   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10215   return true;
10216 }
10217 
10218 // Emit a remark if there are stores to floats that required a floating point
10219 // extension. If the vectorized loop was generated with wider floating point
10220 // operations there will be a performance penalty from the conversion overhead
10221 // and the change in the vector width.
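// A typical source pattern that can trigger this remark (illustrative; the
// exact IR depends on the frontend):
//   float A[N];
//   for (int i = 0; i < N; ++i)
//     A[i] = A[i] * 2.0; // 2.0 is a double: A[i] is fpext'ed to double,
//                        // multiplied, then truncated back before the store.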
10222 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10223   SmallVector<Instruction *, 4> Worklist;
10224   for (BasicBlock *BB : L->getBlocks()) {
10225     for (Instruction &Inst : *BB) {
10226       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10227         if (S->getValueOperand()->getType()->isFloatTy())
10228           Worklist.push_back(S);
10229       }
10230     }
10231   }
10232 
10233   // Traverse the floating point stores upwards, searching for floating point
10234   // conversions.
10235   SmallPtrSet<const Instruction *, 4> Visited;
10236   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10237   while (!Worklist.empty()) {
10238     auto *I = Worklist.pop_back_val();
10239     if (!L->contains(I))
10240       continue;
10241     if (!Visited.insert(I).second)
10242       continue;
10243 
10244     // Emit a remark if the floating point store required a floating
10245     // point conversion.
10246     // TODO: More work could be done to identify the root cause such as a
10247     // constant or a function return type and point the user to it.
10248     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10249       ORE->emit([&]() {
10250         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10251                                           I->getDebugLoc(), L->getHeader())
10252                << "floating point conversion changes vector width. "
10253                << "Mixed floating point precision requires an up/down "
10254                << "cast that will negatively impact performance.";
10255       });
10256 
10257     for (Use &Op : I->operands())
10258       if (auto *OpI = dyn_cast<Instruction>(Op))
10259         Worklist.push_back(OpI);
10260   }
10261 }
10262 
10263 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10264     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10265                                !EnableLoopInterleaving),
10266       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10267                               !EnableLoopVectorization) {}
10268 
10269 bool LoopVectorizePass::processLoop(Loop *L) {
10270   assert((EnableVPlanNativePath || L->isInnermost()) &&
10271          "VPlan-native path is not enabled. Only process inner loops.");
10272 
10273 #ifndef NDEBUG
10274   const std::string DebugLocStr = getDebugLocString(L);
10275 #endif /* NDEBUG */
10276 
10277   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10278                     << L->getHeader()->getParent()->getName() << "' from "
10279                     << DebugLocStr << "\n");
10280 
10281   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10282 
10283   LLVM_DEBUG(
10284       dbgs() << "LV: Loop hints:"
10285              << " force="
10286              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10287                      ? "disabled"
10288                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10289                             ? "enabled"
10290                             : "?"))
10291              << " width=" << Hints.getWidth()
10292              << " interleave=" << Hints.getInterleave() << "\n");
10293 
10294   // Function containing loop
10295   Function *F = L->getHeader()->getParent();
10296 
10297   // Looking at the diagnostic output is the only way to determine if a loop
10298   // was vectorized (other than looking at the IR or machine code), so it
10299   // is important to generate an optimization remark for each loop. Most of
10300   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10301   // generated as OptimizationRemark and OptimizationRemarkMissed are
10302   // less verbose, reporting vectorized loops and unvectorized loops that may
10303   // benefit from vectorization, respectively.
10304 
10305   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10306     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10307     return false;
10308   }
10309 
10310   PredicatedScalarEvolution PSE(*SE, *L);
10311 
10312   // Check if it is legal to vectorize the loop.
10313   LoopVectorizationRequirements Requirements;
10314   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10315                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10316   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10317     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10318     Hints.emitRemarkWithHints();
10319     return false;
10320   }
10321 
10322   // Check the function attributes and profiles to find out if this function
10323   // should be optimized for size.
10324   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10325       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10326 
10327   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10328   // here. They may require CFG and instruction level transformations before
10329   // even evaluating whether vectorization is profitable. Since we cannot modify
10330   // the incoming IR, we need to build VPlan upfront in the vectorization
10331   // pipeline.
10332   if (!L->isInnermost())
10333     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10334                                         ORE, BFI, PSI, Hints, Requirements);
10335 
10336   assert(L->isInnermost() && "Inner loop expected.");
10337 
10338   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10339   // count by optimizing for size, to minimize overheads.
10340   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10341   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10342     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10343                       << "This loop is worth vectorizing only if no scalar "
10344                       << "iteration overheads are incurred.");
10345     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10346       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10347     else {
10348       LLVM_DEBUG(dbgs() << "\n");
10349       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10350     }
10351   }
10352 
10353   // Check the function attributes to see if implicit floats are allowed.
10354   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10355   // an integer loop and the vector instructions selected are purely integer
10356   // vector instructions?
10357   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10358     reportVectorizationFailure(
10359         "Can't vectorize when the NoImplicitFloat attribute is used",
10360         "loop not vectorized due to NoImplicitFloat attribute",
10361         "NoImplicitFloat", ORE, L);
10362     Hints.emitRemarkWithHints();
10363     return false;
10364   }
10365 
10366   // Check if the target supports potentially unsafe FP vectorization.
10367   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10368   // for the target we're vectorizing for, to make sure none of the
10369   // additional fp-math flags can help.
10370   if (Hints.isPotentiallyUnsafe() &&
10371       TTI->isFPVectorizationPotentiallyUnsafe()) {
10372     reportVectorizationFailure(
10373         "Potentially unsafe FP op prevents vectorization",
10374         "loop not vectorized due to unsafe FP support.",
10375         "UnsafeFP", ORE, L);
10376     Hints.emitRemarkWithHints();
10377     return false;
10378   }
10379 
10380   bool AllowOrderedReductions;
10381   // If the flag is set, use that instead and override the TTI behaviour.
10382   if (ForceOrderedReductions.getNumOccurrences() > 0)
10383     AllowOrderedReductions = ForceOrderedReductions;
10384   else
10385     AllowOrderedReductions = TTI->enableOrderedReductions();
10386   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10387     ORE->emit([&]() {
10388       auto *ExactFPMathInst = Requirements.getExactFPInst();
10389       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10390                                                  ExactFPMathInst->getDebugLoc(),
10391                                                  ExactFPMathInst->getParent())
10392              << "loop not vectorized: cannot prove it is safe to reorder "
10393                 "floating-point operations";
10394     });
10395     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10396                          "reorder floating-point operations\n");
10397     Hints.emitRemarkWithHints();
10398     return false;
10399   }
10400 
10401   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10402   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10403 
10404   // If an override option has been passed in for interleaved accesses, use it.
10405   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10406     UseInterleaved = EnableInterleavedMemAccesses;
10407 
10408   // Analyze interleaved memory accesses.
10409   if (UseInterleaved) {
10410     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10411   }
10412 
10413   // Use the cost model.
10414   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10415                                 F, &Hints, IAI);
10416   CM.collectValuesToIgnore();
10417   CM.collectElementTypesForWidening();
10418 
10419   // Use the planner for vectorization.
10420   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10421                                Requirements, ORE);
10422 
10423   // Get user vectorization factor and interleave count.
10424   ElementCount UserVF = Hints.getWidth();
10425   unsigned UserIC = Hints.getInterleave();
10426 
10427   // Plan how to best vectorize, return the best VF and its cost.
10428   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10429 
10430   VectorizationFactor VF = VectorizationFactor::Disabled();
10431   unsigned IC = 1;
10432 
10433   if (MaybeVF) {
10434     if (LVP.requiresTooManyRuntimeChecks()) {
10435       ORE->emit([&]() {
10436         return OptimizationRemarkAnalysisAliasing(
10437                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10438                    L->getHeader())
10439                << "loop not vectorized: cannot prove it is safe to reorder "
10440                   "memory operations";
10441       });
10442       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10443       Hints.emitRemarkWithHints();
10444       return false;
10445     }
10446     VF = *MaybeVF;
10447     // Select the interleave count.
10448     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10449   }
10450 
10451   // Identify the diagnostic messages that should be produced.
10452   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10453   bool VectorizeLoop = true, InterleaveLoop = true;
10454   if (VF.Width.isScalar()) {
10455     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10456     VecDiagMsg = std::make_pair(
10457         "VectorizationNotBeneficial",
10458         "the cost-model indicates that vectorization is not beneficial");
10459     VectorizeLoop = false;
10460   }
10461 
10462   if (!MaybeVF && UserIC > 1) {
10463     // Tell the user interleaving was avoided up-front, despite being explicitly
10464     // requested.
10465     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10466                          "interleaving should be avoided up front\n");
10467     IntDiagMsg = std::make_pair(
10468         "InterleavingAvoided",
10469         "Ignoring UserIC, because interleaving was avoided up front");
10470     InterleaveLoop = false;
10471   } else if (IC == 1 && UserIC <= 1) {
10472     // Tell the user interleaving is not beneficial.
10473     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10474     IntDiagMsg = std::make_pair(
10475         "InterleavingNotBeneficial",
10476         "the cost-model indicates that interleaving is not beneficial");
10477     InterleaveLoop = false;
10478     if (UserIC == 1) {
10479       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10480       IntDiagMsg.second +=
10481           " and is explicitly disabled or interleave count is set to 1";
10482     }
10483   } else if (IC > 1 && UserIC == 1) {
10484     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10485     LLVM_DEBUG(
10486         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10487     IntDiagMsg = std::make_pair(
10488         "InterleavingBeneficialButDisabled",
10489         "the cost-model indicates that interleaving is beneficial "
10490         "but is explicitly disabled or interleave count is set to 1");
10491     InterleaveLoop = false;
10492   }
10493 
10494   // Override IC if user provided an interleave count.
10495   IC = UserIC > 0 ? UserIC : IC;
10496 
10497   // Emit diagnostic messages, if any.
10498   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10499   if (!VectorizeLoop && !InterleaveLoop) {
10500     // Do not vectorize or interleave the loop.
10501     ORE->emit([&]() {
10502       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10503                                       L->getStartLoc(), L->getHeader())
10504              << VecDiagMsg.second;
10505     });
10506     ORE->emit([&]() {
10507       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10508                                       L->getStartLoc(), L->getHeader())
10509              << IntDiagMsg.second;
10510     });
10511     return false;
10512   } else if (!VectorizeLoop && InterleaveLoop) {
10513     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10514     ORE->emit([&]() {
10515       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10516                                         L->getStartLoc(), L->getHeader())
10517              << VecDiagMsg.second;
10518     });
10519   } else if (VectorizeLoop && !InterleaveLoop) {
10520     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10521                       << ") in " << DebugLocStr << '\n');
10522     ORE->emit([&]() {
10523       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10524                                         L->getStartLoc(), L->getHeader())
10525              << IntDiagMsg.second;
10526     });
10527   } else if (VectorizeLoop && InterleaveLoop) {
10528     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10529                       << ") in " << DebugLocStr << '\n');
10530     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10531   }
10532 
10533   bool DisableRuntimeUnroll = false;
10534   MDNode *OrigLoopID = L->getLoopID();
10535   {
10536     // Optimistically generate runtime checks. Drop them if they turn out to not
10537     // be profitable. Limit the scope of Checks, so the cleanup happens
10538     // immediately after vector code generation is done.
10539     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10540                              F->getParent()->getDataLayout());
10541     if (!VF.Width.isScalar() || IC > 1)
10542       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC);
10543 
10544     using namespace ore;
10545     if (!VectorizeLoop) {
10546       assert(IC > 1 && "interleave count should not be 1 or 0");
10547       // If we decided that it is not legal to vectorize the loop, then
10548       // interleave it.
10549       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10550                                  &CM, BFI, PSI, Checks);
10551 
10552       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10553       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10554 
10555       ORE->emit([&]() {
10556         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10557                                   L->getHeader())
10558                << "interleaved loop (interleaved count: "
10559                << NV("InterleaveCount", IC) << ")";
10560       });
10561     } else {
10562       // If we decided that it is *legal* to vectorize the loop, then do it.
10563 
10564       // Consider vectorizing the epilogue too if it's profitable.
10565       VectorizationFactor EpilogueVF =
10566           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10567       if (EpilogueVF.Width.isVector()) {
10568 
10569         // The first pass vectorizes the main loop and creates a scalar epilogue
10570         // to be vectorized by executing the plan (potentially with a different
10571         // factor) again shortly afterwards.
10572         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10573         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10574                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10575 
10576         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10577         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10578                         DT);
10579         ++LoopsVectorized;
10580 
10581         // Second pass vectorizes the epilogue and adjusts the control flow
10582         // edges from the first pass.
10583         EPI.MainLoopVF = EPI.EpilogueVF;
10584         EPI.MainLoopUF = EPI.EpilogueUF;
10585         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10586                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10587                                                  Checks);
10588 
10589         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10590         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10591         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10592         Header->setName("vec.epilog.vector.body");
10593 
10594         // Ensure that the start values for any VPReductionPHIRecipes are
10595         // updated before vectorising the epilogue loop.
10596         for (VPRecipeBase &R : Header->phis()) {
10597           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10598             if (auto *Resume = MainILV.getReductionResumeValue(
10599                     ReductionPhi->getRecurrenceDescriptor())) {
10600               VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
10601               ReductionPhi->setOperand(0, StartVal);
10602             }
10603           }
10604         }
10605 
10606         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10607                         DT);
10608         ++LoopsEpilogueVectorized;
10609 
10610         if (!MainILV.areSafetyChecksAdded())
10611           DisableRuntimeUnroll = true;
10612       } else {
10613         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10614                                &LVL, &CM, BFI, PSI, Checks);
10615 
10616         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10617         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10618         ++LoopsVectorized;
10619 
10620         // Add metadata to disable runtime unrolling a scalar loop when there
10621         // are no runtime checks about strides and memory. A scalar loop that is
10622         // rarely used is not worth unrolling.
10623         if (!LB.areSafetyChecksAdded())
10624           DisableRuntimeUnroll = true;
10625       }
10626       // Report the vectorization decision.
10627       ORE->emit([&]() {
10628         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10629                                   L->getHeader())
10630                << "vectorized loop (vectorization width: "
10631                << NV("VectorizationFactor", VF.Width)
10632                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10633       });
10634     }
10635 
10636     if (ORE->allowExtraAnalysis(LV_NAME))
10637       checkMixedPrecision(L, ORE);
10638   }
10639 
10640   Optional<MDNode *> RemainderLoopID =
10641       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10642                                       LLVMLoopVectorizeFollowupEpilogue});
10643   if (RemainderLoopID.hasValue()) {
10644     L->setLoopID(RemainderLoopID.getValue());
10645   } else {
10646     if (DisableRuntimeUnroll)
10647       AddRuntimeUnrollDisableMetaData(L);
10648 
10649     // Mark the loop as already vectorized to avoid vectorizing again.
10650     Hints.setAlreadyVectorized();
10651   }
10652 
10653   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10654   return true;
10655 }
10656 
10657 LoopVectorizeResult LoopVectorizePass::runImpl(
10658     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10659     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10660     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10661     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10662     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10663   SE = &SE_;
10664   LI = &LI_;
10665   TTI = &TTI_;
10666   DT = &DT_;
10667   BFI = &BFI_;
10668   TLI = TLI_;
10669   AA = &AA_;
10670   AC = &AC_;
10671   GetLAA = &GetLAA_;
10672   DB = &DB_;
10673   ORE = &ORE_;
10674   PSI = PSI_;
10675 
10676   // Don't attempt if
10677   // 1. the target claims to have no vector registers, and
10678   // 2. interleaving won't help ILP.
10679   //
10680   // The second condition is necessary because, even if the target has no
10681   // vector registers, loop vectorization may still enable scalar
10682   // interleaving.
10683   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10684       TTI->getMaxInterleaveFactor(1) < 2)
10685     return LoopVectorizeResult(false, false);
10686 
10687   bool Changed = false, CFGChanged = false;
10688 
10689   // The vectorizer requires loops to be in simplified form.
10690   // Since simplification may add new inner loops, it has to run before the
10691   // legality and profitability checks. This means running the loop vectorizer
10692   // will simplify all loops, regardless of whether anything ends up being
10693   // vectorized.
10694   for (auto &L : *LI)
10695     Changed |= CFGChanged |=
10696         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10697 
10698   // Build up a worklist of inner-loops to vectorize. This is necessary as
10699   // the act of vectorizing or partially unrolling a loop creates new loops
10700   // and can invalidate iterators across the loops.
10701   SmallVector<Loop *, 8> Worklist;
10702 
10703   for (Loop *L : *LI)
10704     collectSupportedLoops(*L, LI, ORE, Worklist);
10705 
10706   LoopsAnalyzed += Worklist.size();
10707 
10708   // Now walk the identified inner loops.
10709   while (!Worklist.empty()) {
10710     Loop *L = Worklist.pop_back_val();
10711 
10712     // For the inner loops we actually process, form LCSSA to simplify the
10713     // transform.
10714     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10715 
10716     Changed |= CFGChanged |= processLoop(L);
10717   }
10718 
10719   // Process each loop nest in the function.
10720   return LoopVectorizeResult(Changed, CFGChanged);
10721 }
10722 
10723 PreservedAnalyses LoopVectorizePass::run(Function &F,
10724                                          FunctionAnalysisManager &AM) {
10725     auto &LI = AM.getResult<LoopAnalysis>(F);
10726     // There are no loops in the function. Return before computing other expensive
10727     // analyses.
10728     if (LI.empty())
10729       return PreservedAnalyses::all();
10730     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10731     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10732     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10733     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10734     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10735     auto &AA = AM.getResult<AAManager>(F);
10736     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10737     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10738     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10739 
10740     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10741     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10742         [&](Loop &L) -> const LoopAccessInfo & {
10743       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10744                                         TLI, TTI, nullptr, nullptr, nullptr};
10745       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10746     };
10747     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10748     ProfileSummaryInfo *PSI =
10749         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10750     LoopVectorizeResult Result =
10751         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10752     if (!Result.MadeAnyChange)
10753       return PreservedAnalyses::all();
10754     PreservedAnalyses PA;
10755 
10756     // We currently do not preserve loopinfo/dominator analyses with outer loop
10757     // vectorization. Until this is addressed, mark these analyses as preserved
10758     // only for non-VPlan-native path.
10759     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10760     if (!EnableVPlanNativePath) {
10761       PA.preserve<LoopAnalysis>();
10762       PA.preserve<DominatorTreeAnalysis>();
10763     }
10764 
10765     if (Result.MadeCFGChange) {
10766       // Making CFG changes likely means a loop got vectorized. Indicate that
10767       // extra simplification passes should be run.
10768       // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10769       // be run if runtime checks have been added.
10770       AM.getResult<ShouldRunExtraVectorPasses>(F);
10771       PA.preserve<ShouldRunExtraVectorPasses>();
10772     } else {
10773       PA.preserveSet<CFGAnalyses>();
10774     }
10775     return PA;
10776 }
10777 
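// With the default options (neither flag forced) the pipeline element printed
// by the function below looks roughly like (illustrative):
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>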
10778 void LoopVectorizePass::printPipeline(
10779     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10780   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10781       OS, MapClassName2PassName);
10782 
10783   OS << "<";
10784   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10785   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10786   OS << ">";
10787 }
10788