1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
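//
// For illustration, a loop like (with hypothetical names)
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is conceptually rewritten, for a vectorization factor of 4, into
//
//   for (int i = 0; i < n - 3; i += 4)
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;   // one wide iteration
//
// with any remaining iterations handled by a scalar epilogue loop.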
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/PatternMatch.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/InstructionCost.h"
135 #include "llvm/Support/MathExtras.h"
136 #include "llvm/Support/raw_ostream.h"
137 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
138 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <cstdlib>
149 #include <functional>
150 #include <iterator>
151 #include <limits>
152 #include <memory>
153 #include <string>
154 #include <tuple>
155 #include <utility>
156 
157 using namespace llvm;
158 
159 #define LV_NAME "loop-vectorize"
160 #define DEBUG_TYPE LV_NAME
161 
162 #ifndef NDEBUG
163 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164 #endif
165 
166 /// @{
167 /// Metadata attribute names
168 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169 const char LLVMLoopVectorizeFollowupVectorized[] =
170     "llvm.loop.vectorize.followup_vectorized";
171 const char LLVMLoopVectorizeFollowupEpilogue[] =
172     "llvm.loop.vectorize.followup_epilogue";
173 /// @}
174 
175 STATISTIC(LoopsVectorized, "Number of loops vectorized");
176 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178 
179 static cl::opt<bool> EnableEpilogueVectorization(
180     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
181     cl::desc("Enable vectorization of epilogue loops."));
182 
183 static cl::opt<unsigned> EpilogueVectorizationForceVF(
184     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185     cl::desc("When epilogue vectorization is enabled, and a value greater than "
186              "1 is specified, forces the given VF for all applicable epilogue "
187              "loops."));
188 
189 static cl::opt<unsigned> EpilogueVectorizationMinVF(
190     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
191     cl::desc("Only loops with vectorization factor equal to or larger than "
192              "the specified value are considered for epilogue vectorization."));
193 
194 /// Loops with a known constant trip count below this number are vectorized only
195 /// if no scalar iteration overheads are incurred.
196 static cl::opt<unsigned> TinyTripCountVectorThreshold(
197     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
198     cl::desc("Loops with a constant trip count that is smaller than this "
199              "value are vectorized only if no scalar iteration overheads "
200              "are incurred."));
201 
202 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
203     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
204     cl::desc("The maximum allowed number of runtime memory checks with a "
205              "vectorize(enable) pragma."));
206 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and lists the available options.
// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
212 namespace PreferPredicateTy {
213   enum Option {
214     ScalarEpilogue = 0,
215     PredicateElseScalarEpilogue,
216     PredicateOrDontVectorize
217   };
218 } // namespace PreferPredicateTy
219 
220 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
221     "prefer-predicate-over-epilogue",
222     cl::init(PreferPredicateTy::ScalarEpilogue),
223     cl::Hidden,
224     cl::desc("Tail-folding and predication preferences over creating a scalar "
225              "epilogue loop."),
    cl::values(
        clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
                   "Don't tail-predicate loops, create scalar epilogue"),
        clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                   "predicate-else-scalar-epilogue",
                   "Prefer tail-folding, create scalar epilogue if "
                   "tail-folding fails."),
        clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                   "predicate-dont-vectorize",
                   "Prefer tail-folding, don't attempt vectorization if "
                   "tail-folding fails.")));
237 
238 static cl::opt<bool> MaximizeBandwidth(
239     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
240     cl::desc("Maximize bandwidth when selecting vectorization factor which "
241              "will be determined by the smallest type in loop."));
242 
243 static cl::opt<bool> EnableInterleavedMemAccesses(
244     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
245     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
246 
247 /// An interleave-group may need masking if it resides in a block that needs
248 /// predication, or in order to mask away gaps.
249 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
250     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));
252 
253 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
254     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
256              "below this number"));
257 
258 static cl::opt<unsigned> ForceTargetNumScalarRegs(
259     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
260     cl::desc("A flag that overrides the target's number of scalar registers."));
261 
262 static cl::opt<unsigned> ForceTargetNumVectorRegs(
263     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
264     cl::desc("A flag that overrides the target's number of vector registers."));
265 
266 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
267     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
268     cl::desc("A flag that overrides the target's max interleave factor for "
269              "scalar loops."));
270 
271 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
272     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
273     cl::desc("A flag that overrides the target's max interleave factor for "
274              "vectorized loops."));
275 
276 static cl::opt<unsigned> ForceTargetInstructionCost(
277     "force-target-instruction-cost", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's expected cost for "
279              "an instruction to a single constant value. Mostly "
280              "useful for getting consistent testing."));
281 
282 static cl::opt<bool> ForceTargetSupportsScalableVectors(
283     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
284     cl::desc(
285         "Pretend that scalable vectors are supported, even if the target does "
286         "not support them. This flag should only be used for testing."));
287 
288 static cl::opt<unsigned> SmallLoopCost(
289     "small-loop-cost", cl::init(20), cl::Hidden,
290     cl::desc(
291         "The cost of a loop that is considered 'small' by the interleaver."));
292 
293 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
294     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
295     cl::desc("Enable the use of the block frequency analysis to access PGO "
296              "heuristics minimizing code growth in cold regions and being more "
297              "aggressive in hot regions."));
298 
299 // Runtime interleave loops for load/store throughput.
300 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
301     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
302     cl::desc(
303         "Enable runtime interleaving until load/store ports are saturated"));
304 
305 /// Interleave small loops with scalar reductions.
306 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
307     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
308     cl::desc("Enable interleaving for loops with small iteration counts that "
309              "contain scalar reductions to expose ILP."));
310 
311 /// The number of stores in a loop that are allowed to need predication.
312 static cl::opt<unsigned> NumberOfStoresToPredicate(
313     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
314     cl::desc("Max number of stores to be predicated behind an if."));
315 
316 static cl::opt<bool> EnableIndVarRegisterHeur(
317     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
318     cl::desc("Count the induction variable only once when interleaving"));
319 
320 static cl::opt<bool> EnableCondStoresVectorization(
321     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
322     cl::desc("Enable if predication of stores during vectorization."));
323 
324 static cl::opt<unsigned> MaxNestedScalarReductionIC(
325     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
326     cl::desc("The maximum interleave count to use when interleaving a scalar "
327              "reduction in a nested loop."));
328 
329 static cl::opt<bool>
330     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
331                            cl::Hidden,
332                            cl::desc("Prefer in-loop vector reductions, "
333                                     "overriding the targets preference."));
334 
335 cl::opt<bool> EnableStrictReductions(
336     "enable-strict-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
338              "FP reductions"));
339 
340 static cl::opt<bool> PreferPredicatedReductionSelect(
341     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
342     cl::desc(
343         "Prefer predicating a reduction operation over an after loop select."));
344 
345 cl::opt<bool> EnableVPlanNativePath(
346     "enable-vplan-native-path", cl::init(false), cl::Hidden,
347     cl::desc("Enable VPlan-native vectorization path with "
348              "support for outer loop vectorization."));
349 
350 // FIXME: Remove this switch once we have divergence analysis. Currently we
351 // assume divergent non-backedge branches when this switch is true.
352 cl::opt<bool> EnableVPlanPredication(
353     "enable-vplan-predication", cl::init(false), cl::Hidden,
354     cl::desc("Enable VPlan-native vectorization path predicator with "
355              "support for outer loop vectorization."));
356 
357 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
359 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
360 // verification of the H-CFGs built.
361 static cl::opt<bool> VPlanBuildStressTest(
362     "vplan-build-stress-test", cl::init(false), cl::Hidden,
363     cl::desc(
364         "Build VPlan for every supported loop nest in the function and bail "
365         "out right after the build (stress test the VPlan H-CFG construction "
366         "in the VPlan-native vectorization path)."));
367 
368 cl::opt<bool> llvm::EnableLoopInterleaving(
369     "interleave-loops", cl::init(true), cl::Hidden,
370     cl::desc("Enable loop interleaving in Loop vectorization passes"));
371 cl::opt<bool> llvm::EnableLoopVectorization(
372     "vectorize-loops", cl::init(true), cl::Hidden,
373     cl::desc("Run the Loop vectorization passes"));
374 
375 cl::opt<bool> PrintVPlansInDotFormat(
376     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
377     cl::desc("Use dot format instead of plain text when dumping VPlans"));
378 
379 /// A helper function that returns true if the given type is irregular. The
380 /// type is irregular if its allocated size doesn't equal the store size of an
381 /// element of the corresponding vector type.
382 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
383   // Determine if an array of N elements of type Ty is "bitcast compatible"
384   // with a <N x Ty> vector.
385   // This is only true if there is no padding between the array elements.
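  // For example, i1 has a type size of 1 bit but an alloc size of 8 bits, so
  // an array of i1 values is padded to one byte per element and is not layout
  // compatible with <N x i1>; i1 is therefore treated as irregular here.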
386   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
387 }
388 
389 /// A helper function that returns the reciprocal of the block probability of
390 /// predicated blocks. If we return X, we are assuming the predicated block
391 /// will execute once for every X iterations of the loop header.
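/// For example, with the current value of 2, callers typically divide a
/// predicated block's cost by 2, reflecting the assumption that the block
/// executes on about half of the loop's iterations.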
392 ///
393 /// TODO: We should use actual block probability here, if available. Currently,
394 ///       we always assume predicated blocks have a 50% chance of executing.
395 static unsigned getReciprocalPredBlockProb() { return 2; }
396 
397 /// A helper function that returns an integer or floating-point constant with
398 /// value C.
399 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
400   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
401                            : ConstantFP::get(Ty, C);
402 }
403 
404 /// Returns "best known" trip count for the specified loop \p L as defined by
405 /// the following procedure:
406 ///   1) Returns exact trip count if it is known.
407 ///   2) Returns expected trip count according to profile data if any.
408 ///   3) Returns upper bound estimate if it is known.
409 ///   4) Returns None if all of the above failed.
410 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
411   // Check if exact trip count is known.
412   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
413     return ExpectedTC;
414 
415   // Check if there is an expected trip count available from profile data.
416   if (LoopVectorizeWithBlockFrequency)
417     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
418       return EstimatedTC;
419 
420   // Check if upper bound estimate is known.
421   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
422     return ExpectedTC;
423 
424   return None;
425 }
426 
427 // Forward declare GeneratedRTChecks.
428 class GeneratedRTChecks;
429 
430 namespace llvm {
431 
432 /// InnerLoopVectorizer vectorizes loops which contain only one basic
433 /// block to a specified vectorization factor (VF).
434 /// This class performs the widening of scalars into vectors, or multiple
435 /// scalars. This class also implements the following features:
436 /// * It inserts an epilogue loop for handling loops that don't have iteration
437 ///   counts that are known to be a multiple of the vectorization factor.
438 /// * It handles the code generation for reduction variables.
439 /// * Scalarization (implementation using scalars) of un-vectorizable
440 ///   instructions.
441 /// InnerLoopVectorizer does not perform any vectorization-legality
442 /// checks, and relies on the caller to check for the different legality
443 /// aspects. The InnerLoopVectorizer relies on the
444 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found during the legality analysis.
446 class InnerLoopVectorizer {
447 public:
448   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
449                       LoopInfo *LI, DominatorTree *DT,
450                       const TargetLibraryInfo *TLI,
451                       const TargetTransformInfo *TTI, AssumptionCache *AC,
452                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
453                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459         PSI(PSI), RTChecks(RTChecks) {
460     // Query this against the original loop and save it here because the profile
461     // of the original loop header may change as the transformation happens.
462     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464   }
465 
466   virtual ~InnerLoopVectorizer() = default;
467 
468   /// Create a new empty loop that will contain vectorized instructions later
469   /// on, while the old loop will be used as the scalar remainder. Control flow
470   /// is generated around the vectorized (and scalar epilogue) loops consisting
471   /// of various checks and bypasses. Return the pre-header block of the new
472   /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
474   /// handle the more complex control flow around the loops.
475   virtual BasicBlock *createVectorizedLoopSkeleton();
476 
477   /// Widen a single instruction within the innermost loop.
478   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
479                         VPTransformState &State);
480 
481   /// Widen a single call instruction within the innermost loop.
482   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
483                             VPTransformState &State);
484 
485   /// Widen a single select instruction within the innermost loop.
486   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
487                               bool InvariantCond, VPTransformState &State);
488 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
490   void fixVectorizedLoop(VPTransformState &State);
491 
  /// Returns true if any runtime checks have been added.
493   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
494 
495   /// A type for vectorized values in the new loop. Each value from the
496   /// original loop, when vectorized, is represented by UF vector values in the
497   /// new unrolled loop, where UF is the unroll factor.
498   using VectorParts = SmallVector<Value *, 2>;
499 
500   /// Vectorize a single GetElementPtrInst based on information gathered and
501   /// decisions taken during planning.
502   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
503                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
504                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
505 
506   /// Vectorize a single PHINode in a block. This method handles the induction
507   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
508   /// arbitrary length vectors.
509   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
510                            VPWidenPHIRecipe *PhiR, VPTransformState &State);
511 
512   /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a scalar instance for the lane and part given by \p Instance.
  /// Uses the VPValue operands from \p Operands instead of \p Instr's
  /// operands.
517   void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
518                             const VPIteration &Instance, bool IfPredicateInstr,
519                             VPTransformState &State);
520 
521   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
522   /// is provided, the integer induction variable will first be truncated to
523   /// the corresponding type.
524   void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
525                              VPValue *Def, VPValue *CastDef,
526                              VPTransformState &State);
527 
528   /// Construct the vector value of a scalarized value \p V one lane at a time.
529   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
530                                  VPTransformState &State);
531 
532   /// Try to vectorize interleaved access group \p Group with the base address
533   /// given in \p Addr, optionally masking the vector operations if \p
534   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
535   /// values in the vectorized loop.
536   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
537                                 ArrayRef<VPValue *> VPDefs,
538                                 VPTransformState &State, VPValue *Addr,
539                                 ArrayRef<VPValue *> StoredValues,
540                                 VPValue *BlockInMask = nullptr);
541 
542   /// Vectorize Load and Store instructions with the base address given in \p
543   /// Addr, optionally masking the vector operations if \p BlockInMask is
544   /// non-null. Use \p State to translate given VPValues to IR values in the
545   /// vectorized loop.
546   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
547                                   VPValue *Def, VPValue *Addr,
548                                   VPValue *StoredValue, VPValue *BlockInMask);
549 
  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None, the class member's Builder is used.
552   void setDebugLocFromInst(const Value *V,
553                            Optional<IRBuilder<> *> CustomBuilder = None);
554 
555   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
556   void fixNonInductionPHIs(VPTransformState &State);
557 
558   /// Returns true if the reordering of FP operations is not allowed, but we are
559   /// able to vectorize with strict in-order reductions for the given RdxDesc.
560   bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
561 
562   /// Create a broadcast instruction. This method generates a broadcast
563   /// instruction (shuffle) for loop invariant values and for the induction
564   /// value. If this is the induction variable then we extend it to N, N+1, ...
565   /// this is needed because each iteration in the loop corresponds to a SIMD
566   /// element.
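  /// For example, broadcasting a loop-invariant value X with VF = 4 yields
  /// <X, X, X, X>; for the induction variable the lanes are subsequently
  /// adjusted to <N, N+1, N+2, N+3>.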
567   virtual Value *getBroadcastInstrs(Value *V);
568 
569 protected:
570   friend class LoopVectorizationPlanner;
571 
572   /// A small list of PHINodes.
573   using PhiVector = SmallVector<PHINode *, 4>;
574 
575   /// A type for scalarized values in the new loop. Each value from the
576   /// original loop, when scalarized, is represented by UF x VF scalar values
577   /// in the new unrolled loop, where UF is the unroll factor and VF is the
578   /// vectorization factor.
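  /// For example, with UF = 2 and VF = 4, a scalarized value is held as two
  /// parts, each containing four scalar Values (one per lane).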
579   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
580 
581   /// Set up the values of the IVs correctly when exiting the vector loop.
582   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
583                     Value *CountRoundDown, Value *EndValue,
584                     BasicBlock *MiddleBlock);
585 
586   /// Create a new induction variable inside L.
587   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
588                                    Value *Step, Instruction *DL);
589 
590   /// Handle all cross-iteration phis in the header.
591   void fixCrossIterationPHIs(VPTransformState &State);
592 
593   /// Fix a first-order recurrence. This is the second phase of vectorizing
594   /// this phi node.
595   void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
596 
597   /// Fix a reduction cross-iteration phi. This is the second phase of
598   /// vectorizing this phi node.
599   void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);
600 
601   /// Clear NSW/NUW flags from reduction instructions if necessary.
602   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
603                                VPTransformState &State);
604 
605   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
606   /// means we need to add the appropriate incoming value from the middle
607   /// block as exiting edges from the scalar epilogue loop (if present) are
608   /// already in place, and we exit the vector loop exclusively to the middle
609   /// block.
610   void fixLCSSAPHIs(VPTransformState &State);
611 
612   /// Iteratively sink the scalarized operands of a predicated instruction into
613   /// the block that was created for it.
614   void sinkScalarOperands(Instruction *PredInst);
615 
616   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
617   /// represented as.
618   void truncateToMinimalBitwidths(VPTransformState &State);
619 
620   /// This function adds
621   /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
623   /// \p Opcode is relevant for FP induction variable.
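  /// For example, with StartIdx = 0, Step = 2 and four vector elements, the
  /// values <0, 2, 4, 6> are added to the lanes of Val.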
624   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
625                                Instruction::BinaryOps Opcode =
626                                Instruction::BinaryOpsEnd);
627 
628   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
629   /// variable on which to base the steps, \p Step is the size of the step, and
630   /// \p EntryVal is the value from the original loop that maps to the steps.
631   /// Note that \p EntryVal doesn't have to be an induction variable - it
632   /// can also be a truncate instruction.
633   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
634                         const InductionDescriptor &ID, VPValue *Def,
635                         VPValue *CastDef, VPTransformState &State);
636 
637   /// Create a vector induction phi node based on an existing scalar one. \p
638   /// EntryVal is the value from the original loop that maps to the vector phi
639   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
640   /// truncate instruction, instead of widening the original IV, we widen a
641   /// version of the IV truncated to \p EntryVal's type.
642   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
643                                        Value *Step, Value *Start,
644                                        Instruction *EntryVal, VPValue *Def,
645                                        VPValue *CastDef,
646                                        VPTransformState &State);
647 
648   /// Returns true if an instruction \p I should be scalarized instead of
649   /// vectorized for the chosen vectorization factor.
650   bool shouldScalarizeInstruction(Instruction *I) const;
651 
652   /// Returns true if we should generate a scalar version of \p IV.
653   bool needsScalarInduction(Instruction *IV) const;
654 
655   /// If there is a cast involved in the induction variable \p ID, which should
656   /// be ignored in the vectorized loop body, this function records the
657   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
658   /// cast. We had already proved that the casted Phi is equal to the uncasted
659   /// Phi in the vectorized loop (under a runtime guard), and therefore
660   /// there is no need to vectorize the cast - the same value can be used in the
661   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
663   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
664   ///
665   /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish which IV is currently being
  /// processed - the original one (if \p EntryVal is a phi corresponding to the
668   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
670   /// latter case \p EntryVal is a TruncInst and we must not record anything for
671   /// that IV, but it's error-prone to expect callers of this routine to care
672   /// about that, hence this explicit parameter.
673   void recordVectorLoopValueForInductionCast(
674       const InductionDescriptor &ID, const Instruction *EntryVal,
675       Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
676       unsigned Part, unsigned Lane = UINT_MAX);
677 
678   /// Generate a shuffle sequence that will reverse the vector Vec.
679   virtual Value *reverseVector(Value *Vec);
680 
681   /// Returns (and creates if needed) the original loop trip count.
682   Value *getOrCreateTripCount(Loop *NewLoop);
683 
684   /// Returns (and creates if needed) the trip count of the widened loop.
685   Value *getOrCreateVectorTripCount(Loop *NewLoop);
686 
687   /// Returns a bitcasted value to the requested vector type.
688   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
689   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
690                                 const DataLayout &DL);
691 
692   /// Emit a bypass check to see if the vector trip count is zero, including if
693   /// it overflows.
694   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
695 
696   /// Emit a bypass check to see if all of the SCEV assumptions we've
697   /// had to make are correct. Returns the block containing the checks or
698   /// nullptr if no checks have been added.
699   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
700 
701   /// Emit bypass checks to check any memory assumptions we may have made.
702   /// Returns the block containing the checks or nullptr if no checks have been
703   /// added.
704   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
705 
706   /// Compute the transformed value of Index at offset StartValue using step
707   /// StepValue.
708   /// For integer induction, returns StartValue + Index * StepValue.
709   /// For pointer induction, returns StartValue[Index * StepValue].
710   /// FIXME: The newly created binary instructions should contain nsw/nuw
711   /// flags, which can be found from the original scalar operations.
712   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
713                               const DataLayout &DL,
714                               const InductionDescriptor &ID) const;
715 
716   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
717   /// vector loop preheader, middle block and scalar preheader. Also
718   /// allocate a loop object for the new vector loop and return it.
719   Loop *createVectorLoopSkeleton(StringRef Prefix);
720 
721   /// Create new phi nodes for the induction variables to resume iteration count
722   /// in the scalar epilogue, from where the vectorized loop left off (given by
723   /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
725   /// vectorization) and the resume values can come from an additional bypass
726   /// block, the \p AdditionalBypass pair provides information about the bypass
727   /// block and the end value on the edge from bypass to this loop.
728   void createInductionResumeValues(
729       Loop *L, Value *VectorTripCount,
730       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
731 
732   /// Complete the loop skeleton by adding debug MDs, creating appropriate
733   /// conditional branches in the middle block, preparing the builder and
734   /// running the verifier. Take in the vector loop \p L as argument, and return
735   /// the preheader of the completed vector loop.
736   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
737 
738   /// Add additional metadata to \p To that was not present on \p Orig.
739   ///
740   /// Currently this is used to add the noalias annotations based on the
741   /// inserted memchecks.  Use this for instructions that are *cloned* into the
742   /// vector loop.
743   void addNewMetadata(Instruction *To, const Instruction *Orig);
744 
745   /// Add metadata from one instruction to another.
746   ///
747   /// This includes both the original MDs from \p From and additional ones (\see
748   /// addNewMetadata).  Use this for *newly created* instructions in the vector
749   /// loop.
750   void addMetadata(Instruction *To, Instruction *From);
751 
752   /// Similar to the previous function but it adds the metadata to a
753   /// vector of instructions.
754   void addMetadata(ArrayRef<Value *> To, Instruction *From);
755 
756   /// Allow subclasses to override and print debug traces before/after vplan
757   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
760 
761   /// The original loop.
762   Loop *OrigLoop;
763 
764   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
765   /// dynamic knowledge to simplify SCEV expressions and converts them to a
766   /// more usable form.
767   PredicatedScalarEvolution &PSE;
768 
769   /// Loop Info.
770   LoopInfo *LI;
771 
772   /// Dominator Tree.
773   DominatorTree *DT;
774 
775   /// Alias Analysis.
776   AAResults *AA;
777 
778   /// Target Library Info.
779   const TargetLibraryInfo *TLI;
780 
781   /// Target Transform Info.
782   const TargetTransformInfo *TTI;
783 
784   /// Assumption Cache.
785   AssumptionCache *AC;
786 
787   /// Interface to emit optimization remarks.
788   OptimizationRemarkEmitter *ORE;
789 
790   /// LoopVersioning.  It's only set up (non-null) if memchecks were
791   /// used.
792   ///
793   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
795   std::unique_ptr<LoopVersioning> LVer;
796 
797   /// The vectorization SIMD factor to use. Each vector will have this many
798   /// vector elements.
799   ElementCount VF;
800 
801   /// The vectorization unroll factor to use. Each scalar is vectorized to this
802   /// many different vector instructions.
803   unsigned UF;
804 
805   /// The builder that we use
806   IRBuilder<> Builder;
807 
808   // --- Vectorization state ---
809 
810   /// The vector-loop preheader.
811   BasicBlock *LoopVectorPreHeader;
812 
813   /// The scalar-loop preheader.
814   BasicBlock *LoopScalarPreHeader;
815 
816   /// Middle Block between the vector and the scalar.
817   BasicBlock *LoopMiddleBlock;
818 
819   /// The (unique) ExitBlock of the scalar loop.  Note that
820   /// there can be multiple exiting edges reaching this block.
821   BasicBlock *LoopExitBlock;
822 
823   /// The vector loop body.
824   BasicBlock *LoopVectorBody;
825 
826   /// The scalar loop body.
827   BasicBlock *LoopScalarBody;
828 
829   /// A list of all bypass blocks. The first block is the entry of the loop.
830   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
831 
832   /// The new Induction variable which was added to the new block.
833   PHINode *Induction = nullptr;
834 
835   /// The induction variable of the old basic block.
836   PHINode *OldInduction = nullptr;
837 
838   /// Store instructions that were predicated.
839   SmallVector<Instruction *, 4> PredicatedInstructions;
840 
841   /// Trip count of the original loop.
842   Value *TripCount = nullptr;
843 
844   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
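  /// For example, TripCount = 17 with VF = 4 and UF = 2 gives
  /// 17 - (17 % 8) = 16.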
845   Value *VectorTripCount = nullptr;
846 
847   /// The legality analysis.
848   LoopVectorizationLegality *Legal;
849 
  /// The profitability analysis.
851   LoopVectorizationCostModel *Cost;
852 
853   // Record whether runtime checks are added.
854   bool AddedSafetyChecks = false;
855 
856   // Holds the end values for each induction variable. We save the end values
857   // so we can later fix-up the external users of the induction variables.
858   DenseMap<PHINode *, Value *> IVEndValues;
859 
860   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
861   // fixed up at the end of vector code generation.
862   SmallVector<PHINode *, 8> OrigPHIsToFix;
863 
864   /// BFI and PSI are used to check for profile guided size optimizations.
865   BlockFrequencyInfo *BFI;
866   ProfileSummaryInfo *PSI;
867 
868   // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
870   bool OptForSizeBasedOnProfile;
871 
872   /// Structure to hold information about generated runtime checks, responsible
873   /// for cleaning the checks, if vectorization turns out unprofitable.
874   GeneratedRTChecks &RTChecks;
875 };
876 
877 class InnerLoopUnroller : public InnerLoopVectorizer {
878 public:
879   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
880                     LoopInfo *LI, DominatorTree *DT,
881                     const TargetLibraryInfo *TLI,
882                     const TargetTransformInfo *TTI, AssumptionCache *AC,
883                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
884                     LoopVectorizationLegality *LVL,
885                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
886                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
887       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
888                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
889                             BFI, PSI, Check) {}
890 
891 private:
892   Value *getBroadcastInstrs(Value *V) override;
893   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
894                        Instruction::BinaryOps Opcode =
895                        Instruction::BinaryOpsEnd) override;
896   Value *reverseVector(Value *Vec) override;
897 };
898 
899 /// Encapsulate information regarding vectorization of a loop and its epilogue.
900 /// This information is meant to be updated and used across two stages of
901 /// epilogue vectorization.
902 struct EpilogueLoopVectorizationInfo {
903   ElementCount MainLoopVF = ElementCount::getFixed(0);
904   unsigned MainLoopUF = 0;
905   ElementCount EpilogueVF = ElementCount::getFixed(0);
906   unsigned EpilogueUF = 0;
907   BasicBlock *MainLoopIterationCountCheck = nullptr;
908   BasicBlock *EpilogueIterationCountCheck = nullptr;
909   BasicBlock *SCEVSafetyCheck = nullptr;
910   BasicBlock *MemSafetyCheck = nullptr;
911   Value *TripCount = nullptr;
912   Value *VectorTripCount = nullptr;
913 
914   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
915                                 unsigned EUF)
916       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
917         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
918     assert(EUF == 1 &&
919            "A high UF for the epilogue loop is likely not beneficial.");
920   }
921 };
922 
923 /// An extension of the inner loop vectorizer that creates a skeleton for a
924 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice: first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
927 /// from the first step and vectorize the epilogue.  This is achieved by
928 /// deriving two concrete strategy classes from this base class and invoking
929 /// them in succession from the loop vectorizer planner.
930 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
931 public:
932   InnerLoopAndEpilogueVectorizer(
933       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
934       DominatorTree *DT, const TargetLibraryInfo *TLI,
935       const TargetTransformInfo *TTI, AssumptionCache *AC,
936       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
937       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
938       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
939       GeneratedRTChecks &Checks)
940       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
941                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
942                             Checks),
943         EPI(EPI) {}
944 
945   // Override this function to handle the more complex control flow around the
946   // three loops.
947   BasicBlock *createVectorizedLoopSkeleton() final override {
948     return createEpilogueVectorizedLoopSkeleton();
949   }
950 
951   /// The interface for creating a vectorized skeleton using one of two
952   /// different strategies, each corresponding to one execution of the vplan
953   /// as described above.
954   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
955 
956   /// Holds and updates state information required to vectorize the main loop
957   /// and its epilogue in two separate passes. This setup helps us avoid
958   /// regenerating and recomputing runtime safety checks. It also helps us to
959   /// shorten the iteration-count-check path length for the cases where the
960   /// iteration count of the loop is so small that the main vector loop is
961   /// completely skipped.
962   EpilogueLoopVectorizationInfo &EPI;
963 };
964 
965 /// A specialized derived class of inner loop vectorizer that performs
966 /// vectorization of *main* loops in the process of vectorizing loops and their
967 /// epilogues.
968 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
969 public:
970   EpilogueVectorizerMainLoop(
971       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
972       DominatorTree *DT, const TargetLibraryInfo *TLI,
973       const TargetTransformInfo *TTI, AssumptionCache *AC,
974       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
975       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
976       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
977       GeneratedRTChecks &Check)
978       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
979                                        EPI, LVL, CM, BFI, PSI, Check) {}
980   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
982   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
983 
984 protected:
985   /// Emits an iteration count bypass check once for the main loop (when \p
986   /// ForEpilogue is false) and once for the epilogue loop (when \p
987   /// ForEpilogue is true).
988   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
989                                              bool ForEpilogue);
990   void printDebugTracesAtStart() override;
991   void printDebugTracesAtEnd() override;
992 };
993 
994 // A specialized derived class of inner loop vectorizer that performs
995 // vectorization of *epilogue* loops in the process of vectorizing loops and
996 // their epilogues.
997 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
998 public:
999   EpilogueVectorizerEpilogueLoop(
1000       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1001       DominatorTree *DT, const TargetLibraryInfo *TLI,
1002       const TargetTransformInfo *TTI, AssumptionCache *AC,
1003       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1004       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1005       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1006       GeneratedRTChecks &Checks)
1007       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1008                                        EPI, LVL, CM, BFI, PSI, Checks) {}
1009   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
1011   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1012 
1013 protected:
1014   /// Emits an iteration count bypass check after the main vector loop has
1015   /// finished to see if there are any iterations left to execute by either
1016   /// the vector epilogue or the scalar epilogue.
1017   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1018                                                       BasicBlock *Bypass,
1019                                                       BasicBlock *Insert);
1020   void printDebugTracesAtStart() override;
1021   void printDebugTracesAtEnd() override;
1022 };
1023 } // end namespace llvm
1024 
/// Look for a meaningful debug location on the instruction or its
1026 /// operands.
1027 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1028   if (!I)
1029     return I;
1030 
1031   DebugLoc Empty;
1032   if (I->getDebugLoc() != Empty)
1033     return I;
1034 
1035   for (Use &Op : I->operands()) {
1036     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1037       if (OpInst->getDebugLoc() != Empty)
1038         return OpInst;
1039   }
1040 
1041   return I;
1042 }
1043 
1044 void InnerLoopVectorizer::setDebugLocFromInst(
1045     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1046   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1047   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1048     const DILocation *DIL = Inst->getDebugLoc();
1049 
    // When an FSDiscriminator is enabled, we don't need to add the multiply
1051     // factors to the discriminators.
1052     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1053         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1054       // FIXME: For scalable vectors, assume vscale=1.
1055       auto NewDIL =
1056           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1057       if (NewDIL)
1058         B->SetCurrentDebugLocation(NewDIL.getValue());
1059       else
1060         LLVM_DEBUG(dbgs()
1061                    << "Failed to create new discriminator: "
1062                    << DIL->getFilename() << " Line: " << DIL->getLine());
1063     } else
1064       B->SetCurrentDebugLocation(DIL);
1065   } else
1066     B->SetCurrentDebugLocation(DebugLoc());
1067 }
1068 
1069 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1070 /// is passed, the message relates to that particular instruction.
1071 #ifndef NDEBUG
1072 static void debugVectorizationMessage(const StringRef Prefix,
1073                                       const StringRef DebugMsg,
1074                                       Instruction *I) {
1075   dbgs() << "LV: " << Prefix << DebugMsg;
1076   if (I != nullptr)
1077     dbgs() << " " << *I;
1078   else
1079     dbgs() << '.';
1080   dbgs() << '\n';
1081 }
1082 #endif
1083 
1084 /// Create an analysis remark that explains why vectorization failed
1085 ///
1086 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1087 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1088 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1089 /// the location of the remark.  \return the remark object that can be
1090 /// streamed to.
1091 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1092     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1093   Value *CodeRegion = TheLoop->getHeader();
1094   DebugLoc DL = TheLoop->getStartLoc();
1095 
1096   if (I) {
1097     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
1099     // using the loop's.
1100     if (I->getDebugLoc())
1101       DL = I->getDebugLoc();
1102   }
1103 
1104   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1105 }
1106 
1107 /// Return a value for Step multiplied by VF.
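/// For example, Step = 8 with a fixed VF of 4 returns the constant 32; with a
/// scalable VF of (vscale x 4) it returns 32 * vscale.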
1108 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1109   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1110   Constant *StepVal = ConstantInt::get(
1111       Step->getType(),
1112       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1113   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1114 }
1115 
1116 namespace llvm {
1117 
1118 /// Return the runtime value for VF.
1119 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1120   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1121   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1122 }
1123 
1124 void reportVectorizationFailure(const StringRef DebugMsg,
1125                                 const StringRef OREMsg, const StringRef ORETag,
1126                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1127                                 Instruction *I) {
1128   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1129   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1130   ORE->emit(
1131       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1132       << "loop not vectorized: " << OREMsg);
1133 }
1134 
1135 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1136                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1137                              Instruction *I) {
1138   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1139   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1140   ORE->emit(
1141       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1142       << Msg);
1143 }
1144 
1145 } // end namespace llvm
1146 
1147 #ifndef NDEBUG
1148 /// \return string containing a file name and a line # for the given loop.
1149 static std::string getDebugLocString(const Loop *L) {
1150   std::string Result;
1151   if (L) {
1152     raw_string_ostream OS(Result);
1153     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1154       LoopDbgLoc.print(OS);
1155     else
1156       // Just print the module name.
1157       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1158     OS.flush();
1159   }
1160   return Result;
1161 }
1162 #endif
1163 
1164 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1165                                          const Instruction *Orig) {
1166   // If the loop was versioned with memchecks, add the corresponding no-alias
1167   // metadata.
1168   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1169     LVer->annotateInstWithNoAlias(To, Orig);
1170 }
1171 
1172 void InnerLoopVectorizer::addMetadata(Instruction *To,
1173                                       Instruction *From) {
1174   propagateMetadata(To, From);
1175   addNewMetadata(To, From);
1176 }
1177 
1178 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1179                                       Instruction *From) {
1180   for (Value *V : To) {
1181     if (Instruction *I = dyn_cast<Instruction>(V))
1182       addMetadata(I, From);
1183   }
1184 }
1185 
1186 namespace llvm {
1187 
// Hints for the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
1190 enum ScalarEpilogueLowering {
1191 
1192   // The default: allowing scalar epilogues.
1193   CM_ScalarEpilogueAllowed,
1194 
1195   // Vectorization with OptForSize: don't allow epilogues.
1196   CM_ScalarEpilogueNotAllowedOptSize,
1197 
  // A special case of vectorization with OptForSize: loops with a very small
1199   // trip count are considered for vectorization under OptForSize, thereby
1200   // making sure the cost of their loop body is dominant, free of runtime
1201   // guards and scalar iteration overheads.
1202   CM_ScalarEpilogueNotAllowedLowTripLoop,
1203 
1204   // Loop hint predicate indicating an epilogue is undesired.
1205   CM_ScalarEpilogueNotNeededUsePredicate,
1206 
  // Directive indicating we must either tail fold or not vectorize.
1208   CM_ScalarEpilogueNotAllowedUsePredicate
1209 };
1210 
1211 /// ElementCountComparator creates a total ordering for ElementCount
1212 /// for the purposes of using it in a set structure.
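/// Fixed-width counts order before scalable ones, and counts of the same kind
/// order by their known minimum value; e.g. (illustrative) 2 < 4 < vscale x 2
/// < vscale x 4.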
1213 struct ElementCountComparator {
1214   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1215     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1216            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1217   }
1218 };
1219 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1220 
1221 /// LoopVectorizationCostModel - estimates the expected speedups due to
1222 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1225 /// expected speedup/slowdowns due to the supported instruction set. We use the
1226 /// TargetTransformInfo to query the different backends for the cost of
1227 /// different operations.
1228 class LoopVectorizationCostModel {
1229 public:
1230   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1231                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1232                              LoopVectorizationLegality *Legal,
1233                              const TargetTransformInfo &TTI,
1234                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1235                              AssumptionCache *AC,
1236                              OptimizationRemarkEmitter *ORE, const Function *F,
1237                              const LoopVectorizeHints *Hints,
1238                              InterleavedAccessInfo &IAI)
1239       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1240         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1241         Hints(Hints), InterleaveInfo(IAI) {}
1242 
1243   /// \return An upper bound for the vectorization factors (both fixed and
1244   /// scalable). If the factors are 0, vectorization and interleaving should be
1245   /// avoided up front.
1246   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1247 
1248   /// \return True if runtime checks are required for vectorization, and false
1249   /// otherwise.
1250   bool runtimeChecksRequired();
1251 
1252   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
  /// that vectorization factor will be selected, provided vectorization is
  /// possible.
1256   VectorizationFactor
1257   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1258 
1259   VectorizationFactor
1260   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1261                                     const LoopVectorizationPlanner &LVP);
1262 
1263   /// Setup cost-based decisions for user vectorization factor.
1264   void selectUserVectorizationFactor(ElementCount UserVF) {
1265     collectUniformsAndScalars(UserVF);
1266     collectInstsToScalarize(UserVF);
1267   }
1268 
1269   /// \return The size (in bits) of the smallest and widest types in the code
1270   /// that needs to be vectorized. We ignore values that remain scalar such as
1271   /// 64 bit loop indices.
1272   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1273 
1274   /// \return The desired interleave count.
1275   /// If interleave count has been specified by metadata it will be returned.
1276   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1277   /// are the selected vectorization factor and the cost of the selected VF.
1278   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1279 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
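  /// For example (illustrative), a unit-stride load may be mapped to CM_Widen,
  /// a reverse consecutive access to CM_Widen_Reverse, and an access that is
  /// neither consecutive nor interleaved to CM_GatherScatter or CM_Scalarize,
  /// whichever the cost model finds cheaper.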
1287   void setCostBasedWideningDecision(ElementCount VF);
1288 
1289   /// A struct that represents some properties of the register usage
1290   /// of a loop.
1291   struct RegisterUsage {
1292     /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
1294     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1295     /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
1297     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1298   };
1299 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1302   SmallVector<RegisterUsage, 8>
1303   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1304 
1305   /// Collect values we want to ignore in the cost model.
1306   void collectValuesToIgnore();
1307 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1310   void collectInLoopReductions();
1311 
1312   /// Returns true if we should use strict in-order reductions for the given
1313   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1314   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1315   /// of FP operations.
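  /// For example (illustrative), a floating-point accumulation such as
  ///   for (i) Sum += A[i];
  /// compiled without permission to reassociate FP operations must preserve the
  /// original operation order, so it can only be vectorized as an ordered
  /// (strict) in-loop reduction.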
1316   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1317     return EnableStrictReductions && !Hints->allowReordering() &&
1318            RdxDesc.isOrdered();
1319   }
1320 
1321   /// \returns The smallest bitwidth each instruction can be represented with.
1322   /// The vector equivalents of these instructions should be truncated to this
1323   /// type.
1324   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1325     return MinBWs;
1326   }
1327 
1328   /// \returns True if it is more profitable to scalarize instruction \p I for
1329   /// vectorization factor \p VF.
1330   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1331     assert(VF.isVector() &&
1332            "Profitable to scalarize relevant only for VF > 1.");
1333 
1334     // Cost model is not run in the VPlan-native path - return conservative
1335     // result until this changes.
1336     if (EnableVPlanNativePath)
1337       return false;
1338 
1339     auto Scalars = InstsToScalarize.find(VF);
1340     assert(Scalars != InstsToScalarize.end() &&
1341            "VF not yet analyzed for scalarization profitability");
1342     return Scalars->second.find(I) != Scalars->second.end();
1343   }
1344 
1345   /// Returns true if \p I is known to be uniform after vectorization.
1346   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1347     if (VF.isScalar())
1348       return true;
1349 
1350     // Cost model is not run in the VPlan-native path - return conservative
1351     // result until this changes.
1352     if (EnableVPlanNativePath)
1353       return false;
1354 
1355     auto UniformsPerVF = Uniforms.find(VF);
1356     assert(UniformsPerVF != Uniforms.end() &&
1357            "VF not yet analyzed for uniformity");
1358     return UniformsPerVF->second.count(I);
1359   }
1360 
1361   /// Returns true if \p I is known to be scalar after vectorization.
1362   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1363     if (VF.isScalar())
1364       return true;
1365 
1366     // Cost model is not run in the VPlan-native path - return conservative
1367     // result until this changes.
1368     if (EnableVPlanNativePath)
1369       return false;
1370 
1371     auto ScalarsPerVF = Scalars.find(VF);
1372     assert(ScalarsPerVF != Scalars.end() &&
1373            "Scalar values are not calculated for VF");
1374     return ScalarsPerVF->second.count(I);
1375   }
1376 
1377   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1378   /// for vectorization factor \p VF.
1379   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1380     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1381            !isProfitableToScalarize(I, VF) &&
1382            !isScalarAfterVectorization(I, VF);
1383   }
1384 
1385   /// Decision that was taken during cost calculation for memory instruction.
1386   enum InstWidening {
1387     CM_Unknown,
1388     CM_Widen,         // For consecutive accesses with stride +1.
1389     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1390     CM_Interleave,
1391     CM_GatherScatter,
1392     CM_Scalarize
1393   };
1394 
1395   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1396   /// instruction \p I and vector width \p VF.
1397   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1398                            InstructionCost Cost) {
1399     assert(VF.isVector() && "Expected VF >=2");
1400     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1401   }
1402 
1403   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1404   /// interleaving group \p Grp and vector width \p VF.
1405   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1406                            ElementCount VF, InstWidening W,
1407                            InstructionCost Cost) {
1408     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group,
    // but assign the cost to one instruction (the insert position) only.
1411     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1412       if (auto *I = Grp->getMember(i)) {
1413         if (Grp->getInsertPos() == I)
1414           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1415         else
1416           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1417       }
1418     }
1419   }
1420 
1421   /// Return the cost model decision for the given instruction \p I and vector
1422   /// width \p VF. Return CM_Unknown if this instruction did not pass
1423   /// through the cost modeling.
1424   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1425     assert(VF.isVector() && "Expected VF to be a vector VF");
1426     // Cost model is not run in the VPlan-native path - return conservative
1427     // result until this changes.
1428     if (EnableVPlanNativePath)
1429       return CM_GatherScatter;
1430 
1431     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1432     auto Itr = WideningDecisions.find(InstOnVF);
1433     if (Itr == WideningDecisions.end())
1434       return CM_Unknown;
1435     return Itr->second.first;
1436   }
1437 
1438   /// Return the vectorization cost for the given instruction \p I and vector
1439   /// width \p VF.
1440   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1441     assert(VF.isVector() && "Expected VF >=2");
1442     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1443     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1444            "The cost is not calculated");
1445     return WideningDecisions[InstOnVF].second;
1446   }
1447 
  /// Return true if instruction \p I is an optimizable truncate whose operand
1449   /// is an induction variable. Such a truncate will be removed by adding a new
1450   /// induction variable with the destination type.
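  /// For example (illustrative), in
  ///   for (i64 I = 0; ...; ++I) { i32 J = trunc(I); A[J] = ...; }
  /// the truncate of the wide induction variable can be replaced by a new i32
  /// induction variable instead of truncating every vector lane.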
1451   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1452     // If the instruction is not a truncate, return false.
1453     auto *Trunc = dyn_cast<TruncInst>(I);
1454     if (!Trunc)
1455       return false;
1456 
1457     // Get the source and destination types of the truncate.
1458     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1459     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1460 
1461     // If the truncate is free for the given types, return false. Replacing a
1462     // free truncate with an induction variable would add an induction variable
1463     // update instruction to each iteration of the loop. We exclude from this
1464     // check the primary induction variable since it will need an update
1465     // instruction regardless.
1466     Value *Op = Trunc->getOperand(0);
1467     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1468       return false;
1469 
1470     // If the truncated value is not an induction variable, return false.
1471     return Legal->isInductionPhi(Op);
1472   }
1473 
1474   /// Collects the instructions to scalarize for each predicated instruction in
1475   /// the loop.
1476   void collectInstsToScalarize(ElementCount VF);
1477 
1478   /// Collect Uniform and Scalar values for the given \p VF.
1479   /// The sets depend on CM decision for Load/Store instructions
1480   /// that may be vectorized as interleave, gather-scatter or scalarized.
1481   void collectUniformsAndScalars(ElementCount VF) {
1482     // Do the analysis once.
1483     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1484       return;
1485     setCostBasedWideningDecision(VF);
1486     collectLoopUniforms(VF);
1487     collectLoopScalars(VF);
1488   }
1489 
1490   /// Returns true if the target machine supports masked store operation
1491   /// for the given \p DataType and kind of access to \p Ptr.
1492   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1493     return Legal->isConsecutivePtr(Ptr) &&
1494            TTI.isLegalMaskedStore(DataType, Alignment);
1495   }
1496 
1497   /// Returns true if the target machine supports masked load operation
1498   /// for the given \p DataType and kind of access to \p Ptr.
1499   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1500     return Legal->isConsecutivePtr(Ptr) &&
1501            TTI.isLegalMaskedLoad(DataType, Alignment);
1502   }
1503 
1504   /// Returns true if the target machine can represent \p V as a masked gather
1505   /// or scatter operation.
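  /// For example (illustrative), an indexed load such as A[B[i]] inside the
  /// loop can be widened into a masked gather on targets that support it;
  /// otherwise it has to be scalarized.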
1506   bool isLegalGatherOrScatter(Value *V) {
1507     bool LI = isa<LoadInst>(V);
1508     bool SI = isa<StoreInst>(V);
1509     if (!LI && !SI)
1510       return false;
1511     auto *Ty = getLoadStoreType(V);
1512     Align Align = getLoadStoreAlignment(V);
1513     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1514            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1515   }
1516 
1517   /// Returns true if the target machine supports all of the reduction
1518   /// variables found for the given VF.
1519   bool canVectorizeReductions(ElementCount VF) {
1520     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1521       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1522       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1523     }));
1524   }
1525 
1526   /// Returns true if \p I is an instruction that will be scalarized with
1527   /// predication. Such instructions include conditional stores and
1528   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1531   bool isScalarWithPredication(Instruction *I) const;
1532 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1536   bool isPredicatedInst(Instruction *I) {
1537     if (!blockNeedsPredication(I->getParent()))
1538       return false;
1539     // Loads and stores that need some form of masked operation are predicated
1540     // instructions.
1541     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1542       return Legal->isMaskRequired(I);
1543     return isScalarWithPredication(I);
1544   }
1545 
1546   /// Returns true if \p I is a memory instruction with consecutive memory
1547   /// access that can be widened.
1548   bool
1549   memoryInstructionCanBeWidened(Instruction *I,
1550                                 ElementCount VF = ElementCount::getFixed(1));
1551 
1552   /// Returns true if \p I is a memory instruction in an interleaved-group
1553   /// of memory accesses that can be vectorized with wide vector loads/stores
1554   /// and shuffles.
1555   bool
1556   interleavedAccessCanBeWidened(Instruction *I,
1557                                 ElementCount VF = ElementCount::getFixed(1));
1558 
1559   /// Check if \p Instr belongs to any interleaved access group.
1560   bool isAccessInterleaved(Instruction *Instr) {
1561     return InterleaveInfo.isInterleaved(Instr);
1562   }
1563 
1564   /// Get the interleaved access group that \p Instr belongs to.
1565   const InterleaveGroup<Instruction> *
1566   getInterleavedAccessGroup(Instruction *Instr) {
1567     return InterleaveInfo.getInterleaveGroup(Instr);
1568   }
1569 
1570   /// Returns true if we're required to use a scalar epilogue for at least
1571   /// the final iteration of the original loop.
1572   bool requiresScalarEpilogue(ElementCount VF) const {
1573     if (!isScalarEpilogueAllowed())
1574       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1577     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1578       return true;
1579     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1580   }
1581 
  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited due to
  /// optsize or a loop hint annotation.
1584   bool isScalarEpilogueAllowed() const {
1585     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1586   }
1587 
1588   /// Returns true if all loop blocks should be masked to fold tail loop.
1589   bool foldTailByMasking() const { return FoldTailByMasking; }
1590 
1591   bool blockNeedsPredication(BasicBlock *BB) const {
1592     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1593   }
1594 
1595   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1596   /// nodes to the chain of instructions representing the reductions. Uses a
1597   /// MapVector to ensure deterministic iteration order.
1598   using ReductionChainMap =
1599       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1600 
1601   /// Return the chain of instructions representing an inloop reduction.
1602   const ReductionChainMap &getInLoopReductionChains() const {
1603     return InLoopReductionChains;
1604   }
1605 
1606   /// Returns true if the Phi is part of an inloop reduction.
1607   bool isInLoopReduction(PHINode *Phi) const {
1608     return InLoopReductionChains.count(Phi);
1609   }
1610 
1611   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1612   /// with factor VF.  Return the cost of the instruction, including
1613   /// scalarization overhead if it's needed.
1614   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1615 
1616   /// Estimate cost of a call instruction CI if it were vectorized with factor
1617   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1621   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1622                                     bool &NeedToScalarize) const;
1623 
1624   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1625   /// that of B.
1626   bool isMoreProfitable(const VectorizationFactor &A,
1627                         const VectorizationFactor &B) const;
1628 
1629   /// Invalidates decisions already taken by the cost model.
1630   void invalidateCostModelingDecisions() {
1631     WideningDecisions.clear();
1632     Uniforms.clear();
1633     Scalars.clear();
1634   }
1635 
1636 private:
1637   unsigned NumPredStores = 0;
1638 
1639   /// \return An upper bound for the vectorization factors for both
1640   /// fixed and scalable vectorization, where the minimum-known number of
1641   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1642   /// disabled or unsupported, then the scalable part will be equal to
1643   /// ElementCount::getScalable(0).
1644   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1645                                            ElementCount UserVF);
1646 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip count, but limited to a maximum safe VF.
1649   /// This is a helper function of computeFeasibleMaxVF.
1650   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1651   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1653   /// D98509). The issue is currently under investigation and this workaround
1654   /// will be removed as soon as possible.
1655   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1656                                        unsigned SmallestType,
1657                                        unsigned WidestType,
1658                                        const ElementCount &MaxSafeVF);
1659 
1660   /// \return the maximum legal scalable VF, based on the safe max number
1661   /// of elements.
1662   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1663 
1664   /// The vectorization cost is a combination of the cost itself and a boolean
1665   /// indicating whether any of the contributing operations will actually
1666   /// operate on vector values after type legalization in the backend. If this
1667   /// latter value is false, then all operations will be scalarized (i.e. no
1668   /// vectorization has actually taken place).
1669   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1670 
1671   /// Returns the expected execution cost. The unit of the cost does
1672   /// not matter because we use the 'cost' units to compare different
1673   /// vector widths. The cost that is returned is *not* normalized by
1674   /// the factor width.
1675   VectorizationCostTy expectedCost(ElementCount VF);
1676 
1677   /// Returns the execution time cost of an instruction for a given vector
1678   /// width. Vector width of one means scalar.
1679   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1680 
1681   /// The cost-computation logic from getInstructionCost which provides
1682   /// the vector type as an output parameter.
1683   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1684                                      Type *&VectorTy);
1685 
1686   /// Return the cost of instructions in an inloop reduction pattern, if I is
1687   /// part of that pattern.
1688   InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1689                                           Type *VectorTy,
1690                                           TTI::TargetCostKind CostKind);
1691 
1692   /// Calculate vectorization cost of memory instruction \p I.
1693   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1694 
1695   /// The cost computation for scalarized memory instruction.
1696   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1697 
1698   /// The cost computation for interleaving group of memory instructions.
1699   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1700 
1701   /// The cost computation for Gather/Scatter instruction.
1702   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1703 
1704   /// The cost computation for widening instruction \p I with consecutive
1705   /// memory access.
1706   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1707 
  /// The cost computation for a Load/Store instruction \p I with a uniform
  /// pointer -
1709   /// Load: scalar load + broadcast.
1710   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1711   /// element)
1712   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1713 
1714   /// Estimate the overhead of scalarizing an instruction. This is a
1715   /// convenience wrapper for the type-based getScalarizationOverhead API.
1716   InstructionCost getScalarizationOverhead(Instruction *I,
1717                                            ElementCount VF) const;
1718 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1721   bool isConsecutiveLoadOrStore(Instruction *I);
1722 
1723   /// Returns true if an artificially high cost for emulated masked memrefs
1724   /// should be used.
1725   bool useEmulatedMaskMemRefHack(Instruction *I);
1726 
1727   /// Map of scalar integer values to the smallest bitwidth they can be legally
1728   /// represented as. The vector equivalents of these values should be truncated
1729   /// to this type.
1730   MapVector<Instruction *, uint64_t> MinBWs;
1731 
1732   /// A type representing the costs for instructions if they were to be
1733   /// scalarized rather than vectorized. The entries are Instruction-Cost
1734   /// pairs.
1735   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1736 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1739   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1740 
1741   /// Records whether it is allowed to have the original scalar loop execute at
1742   /// least once. This may be needed as a fallback loop in case runtime
1743   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1745   /// or as a peel-loop to handle gaps in interleave-groups.
1746   /// Under optsize and when the trip count is very small we don't allow any
1747   /// iterations to execute in the scalar loop.
1748   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1749 
1750   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1751   bool FoldTailByMasking = false;
1752 
1753   /// A map holding scalar costs for different vectorization factors. The
1754   /// presence of a cost for an instruction in the mapping indicates that the
1755   /// instruction will be scalarized when vectorizing with the associated
1756   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1757   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1758 
1759   /// Holds the instructions known to be uniform after vectorization.
1760   /// The data is collected per VF.
1761   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1762 
1763   /// Holds the instructions known to be scalar after vectorization.
1764   /// The data is collected per VF.
1765   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1766 
1767   /// Holds the instructions (address computations) that are forced to be
1768   /// scalarized.
1769   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1770 
1771   /// PHINodes of the reductions that should be expanded in-loop along with
1772   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1774   ReductionChainMap InLoopReductionChains;
1775 
1776   /// A Map of inloop reduction operations and their immediate chain operand.
1777   /// FIXME: This can be removed once reductions can be costed correctly in
1778   /// vplan. This was added to allow quick lookup to the inloop operations,
1779   /// without having to loop through InLoopReductionChains.
1780   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1781 
1782   /// Returns the expected difference in cost from scalarizing the expression
1783   /// feeding a predicated instruction \p PredInst. The instructions to
1784   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1785   /// non-negative return value implies the expression will be scalarized.
1786   /// Currently, only single-use chains are considered for scalarization.
1787   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1788                               ElementCount VF);
1789 
1790   /// Collect the instructions that are uniform after vectorization. An
1791   /// instruction is uniform if we represent it with a single scalar value in
1792   /// the vectorized loop corresponding to each vector iteration. Examples of
1793   /// uniform instructions include pointer operands of consecutive or
1794   /// interleaved memory accesses. Note that although uniformity implies an
1795   /// instruction will be scalar, the reverse is not true. In general, a
1796   /// scalarized instruction will be represented by VF scalar values in the
1797   /// vectorized loop, each corresponding to an iteration of the original
1798   /// scalar loop.
1799   void collectLoopUniforms(ElementCount VF);
1800 
1801   /// Collect the instructions that are scalar after vectorization. An
1802   /// instruction is scalar if it is known to be uniform or will be scalarized
1803   /// during vectorization. Non-uniform scalarized instructions will be
1804   /// represented by VF values in the vectorized loop, each corresponding to an
1805   /// iteration of the original scalar loop.
1806   void collectLoopScalars(ElementCount VF);
1807 
1808   /// Keeps cost model vectorization decision and cost for instructions.
1809   /// Right now it is used for memory instructions only.
1810   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1811                                 std::pair<InstWidening, InstructionCost>>;
1812 
1813   DecisionList WideningDecisions;
1814 
1815   /// Returns true if \p V is expected to be vectorized and it needs to be
1816   /// extracted.
1817   bool needsExtract(Value *V, ElementCount VF) const {
1818     Instruction *I = dyn_cast<Instruction>(V);
1819     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1820         TheLoop->isLoopInvariant(I))
1821       return false;
1822 
1823     // Assume we can vectorize V (and hence we need extraction) if the
1824     // scalars are not computed yet. This can happen, because it is called
1825     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1826     // the scalars are collected. That should be a safe assumption in most
1827     // cases, because we check if the operands have vectorizable types
1828     // beforehand in LoopVectorizationLegality.
1829     return Scalars.find(VF) == Scalars.end() ||
1830            !isScalarAfterVectorization(I, VF);
1831   };
1832 
1833   /// Returns a range containing only operands needing to be extracted.
1834   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1835                                                    ElementCount VF) const {
1836     return SmallVector<Value *, 4>(make_filter_range(
1837         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1838   }
1839 
1840   /// Determines if we have the infrastructure to vectorize loop \p L and its
1841   /// epilogue, assuming the main loop is vectorized by \p VF.
1842   bool isCandidateForEpilogueVectorization(const Loop &L,
1843                                            const ElementCount VF) const;
1844 
1845   /// Returns true if epilogue vectorization is considered profitable, and
1846   /// false otherwise.
1847   /// \p VF is the vectorization factor chosen for the original loop.
1848   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1849 
1850 public:
1851   /// The loop that we evaluate.
1852   Loop *TheLoop;
1853 
1854   /// Predicated scalar evolution analysis.
1855   PredicatedScalarEvolution &PSE;
1856 
1857   /// Loop Info analysis.
1858   LoopInfo *LI;
1859 
1860   /// Vectorization legality.
1861   LoopVectorizationLegality *Legal;
1862 
1863   /// Vector target information.
1864   const TargetTransformInfo &TTI;
1865 
1866   /// Target Library Info.
1867   const TargetLibraryInfo *TLI;
1868 
1869   /// Demanded bits analysis.
1870   DemandedBits *DB;
1871 
1872   /// Assumption cache.
1873   AssumptionCache *AC;
1874 
1875   /// Interface to emit optimization remarks.
1876   OptimizationRemarkEmitter *ORE;
1877 
1878   const Function *TheFunction;
1879 
1880   /// Loop Vectorize Hint.
1881   const LoopVectorizeHints *Hints;
1882 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1885   InterleavedAccessInfo &InterleaveInfo;
1886 
1887   /// Values to ignore in the cost model.
1888   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1889 
1890   /// Values to ignore in the cost model when VF > 1.
1891   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1892 
1893   /// Profitable vector factors.
1894   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1895 };
1896 } // end namespace llvm
1897 
1898 /// Helper struct to manage generating runtime checks for vectorization.
1899 ///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating their cost, and are un-linked from the existing IR. After
/// deciding to vectorize, the checks are moved back. If deciding not to
/// vectorize, the temporary blocks are completely removed.
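///
/// Typical use (illustrative sketch; the variable names are placeholders):
///   GeneratedRTChecks Checks(SE, DT, LI, DL);
///   Checks.Create(L, LAI, UnionPred); // Build checks in temporary blocks.
///   // ... estimate the cost of the checks, decide whether to vectorize ...
///   Checks.emitSCEVChecks(L, Bypass, VecPreheader, ExitBlock);
///   Checks.emitMemRuntimeChecks(L, Bypass, VecPreheader);
///   // If we decide not to vectorize, the destructor removes unused blocks.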
1904 class GeneratedRTChecks {
1905   /// Basic block which contains the generated SCEV checks, if any.
1906   BasicBlock *SCEVCheckBlock = nullptr;
1907 
1908   /// The value representing the result of the generated SCEV checks. If it is
1909   /// nullptr, either no SCEV checks have been generated or they have been used.
1910   Value *SCEVCheckCond = nullptr;
1911 
1912   /// Basic block which contains the generated memory runtime checks, if any.
1913   BasicBlock *MemCheckBlock = nullptr;
1914 
1915   /// The value representing the result of the generated memory runtime checks.
1916   /// If it is nullptr, either no memory runtime checks have been generated or
1917   /// they have been used.
1918   Instruction *MemRuntimeCheckCond = nullptr;
1919 
1920   DominatorTree *DT;
1921   LoopInfo *LI;
1922 
1923   SCEVExpander SCEVExp;
1924   SCEVExpander MemCheckExp;
1925 
1926 public:
1927   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1928                     const DataLayout &DL)
1929       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1930         MemCheckExp(SE, DL, "scev.check") {}
1931 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1937   void Create(Loop *L, const LoopAccessInfo &LAI,
1938               const SCEVUnionPredicate &UnionPred) {
1939 
1940     BasicBlock *LoopHeader = L->getHeader();
1941     BasicBlock *Preheader = L->getLoopPreheader();
1942 
1943     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1944     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1945     // may be used by SCEVExpander. The blocks will be un-linked from their
1946     // predecessors and removed from LI & DT at the end of the function.
1947     if (!UnionPred.isAlwaysTrue()) {
1948       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1949                                   nullptr, "vector.scevcheck");
1950 
1951       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1952           &UnionPred, SCEVCheckBlock->getTerminator());
1953     }
1954 
1955     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1956     if (RtPtrChecking.Need) {
1957       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1958       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1959                                  "vector.memcheck");
1960 
1961       std::tie(std::ignore, MemRuntimeCheckCond) =
1962           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1963                            RtPtrChecking.getChecks(), MemCheckExp);
1964       assert(MemRuntimeCheckCond &&
1965              "no RT checks generated although RtPtrChecking "
1966              "claimed checks are required");
1967     }
1968 
1969     if (!MemCheckBlock && !SCEVCheckBlock)
1970       return;
1971 
    // Unhook the temporary blocks containing the checks and update various
    // places accordingly.
1974     if (SCEVCheckBlock)
1975       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1976     if (MemCheckBlock)
1977       MemCheckBlock->replaceAllUsesWith(Preheader);
1978 
1979     if (SCEVCheckBlock) {
1980       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1981       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1982       Preheader->getTerminator()->eraseFromParent();
1983     }
1984     if (MemCheckBlock) {
1985       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1986       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1987       Preheader->getTerminator()->eraseFromParent();
1988     }
1989 
1990     DT->changeImmediateDominator(LoopHeader, Preheader);
1991     if (MemCheckBlock) {
1992       DT->eraseNode(MemCheckBlock);
1993       LI->removeBlock(MemCheckBlock);
1994     }
1995     if (SCEVCheckBlock) {
1996       DT->eraseNode(SCEVCheckBlock);
1997       LI->removeBlock(SCEVCheckBlock);
1998     }
1999   }
2000 
2001   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2002   /// unused.
2003   ~GeneratedRTChecks() {
2004     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2005     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2006     if (!SCEVCheckCond)
2007       SCEVCleaner.markResultUsed();
2008 
2009     if (!MemRuntimeCheckCond)
2010       MemCheckCleaner.markResultUsed();
2011 
2012     if (MemRuntimeCheckCond) {
2013       auto &SE = *MemCheckExp.getSE();
2014       // Memory runtime check generation creates compares that use expanded
2015       // values. Remove them before running the SCEVExpanderCleaners.
2016       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2017         if (MemCheckExp.isInsertedInstruction(&I))
2018           continue;
2019         SE.forgetValue(&I);
2020         SE.eraseValueFromMap(&I);
2021         I.eraseFromParent();
2022       }
2023     }
2024     MemCheckCleaner.cleanup();
2025     SCEVCleaner.cleanup();
2026 
2027     if (SCEVCheckCond)
2028       SCEVCheckBlock->eraseFromParent();
2029     if (MemRuntimeCheckCond)
2030       MemCheckBlock->eraseFromParent();
2031   }
2032 
2033   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2034   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2035   /// depending on the generated condition.
2036   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2037                              BasicBlock *LoopVectorPreHeader,
2038                              BasicBlock *LoopExitBlock) {
2039     if (!SCEVCheckCond)
2040       return nullptr;
2041     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2042       if (C->isZero())
2043         return nullptr;
2044 
2045     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2046 
2047     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2048     // Create new preheader for vector loop.
2049     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2050       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2051 
2052     SCEVCheckBlock->getTerminator()->eraseFromParent();
2053     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2054     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2055                                                 SCEVCheckBlock);
2056 
2057     DT->addNewBlock(SCEVCheckBlock, Pred);
2058     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2059 
2060     ReplaceInstWithInst(
2061         SCEVCheckBlock->getTerminator(),
2062         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2063     // Mark the check as used, to prevent it from being removed during cleanup.
2064     SCEVCheckCond = nullptr;
2065     return SCEVCheckBlock;
2066   }
2067 
2068   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2069   /// the branches to branch to the vector preheader or \p Bypass, depending on
2070   /// the generated condition.
2071   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2072                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays overlap.
2074     if (!MemRuntimeCheckCond)
2075       return nullptr;
2076 
2077     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2078     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2079                                                 MemCheckBlock);
2080 
2081     DT->addNewBlock(MemCheckBlock, Pred);
2082     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2083     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2084 
2085     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2086       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2087 
2088     ReplaceInstWithInst(
2089         MemCheckBlock->getTerminator(),
2090         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2091     MemCheckBlock->getTerminator()->setDebugLoc(
2092         Pred->getTerminator()->getDebugLoc());
2093 
2094     // Mark the check as used, to prevent it from being removed during cleanup.
2095     MemRuntimeCheckCond = nullptr;
2096     return MemCheckBlock;
2097   }
2098 };
2099 
2100 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2107 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2108 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2109 // provides *explicit vectorization hints* (LV can bypass legal checks and
2110 // assume that vectorization is legal). However, both hints are implemented
2111 // using the same metadata (llvm.loop.vectorize, processed by
2112 // LoopVectorizeHints). This will be fixed in the future when the native IR
2113 // representation for pragma 'omp simd' is introduced.
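// For example (illustrative), either of the following annotations on the outer
// loop would make it a candidate for explicit outer-loop vectorization with a
// vector length of 4:
//   #pragma omp simd simdlen(4)
//   #pragma clang loop vectorize(enable) vectorize_width(4)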
2114 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2115                                    OptimizationRemarkEmitter *ORE) {
2116   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2117   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2118 
2119   // Only outer loops with an explicit vectorization hint are supported.
2120   // Unannotated outer loops are ignored.
2121   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2122     return false;
2123 
2124   Function *Fn = OuterLp->getHeader()->getParent();
2125   if (!Hints.allowVectorization(Fn, OuterLp,
2126                                 true /*VectorizeOnlyWhenForced*/)) {
2127     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2128     return false;
2129   }
2130 
2131   if (Hints.getInterleave() > 1) {
2132     // TODO: Interleave support is future work.
2133     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2134                          "outer loops.\n");
2135     Hints.emitRemarkWithHints();
2136     return false;
2137   }
2138 
2139   return true;
2140 }
2141 
2142 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2143                                   OptimizationRemarkEmitter *ORE,
2144                                   SmallVectorImpl<Loop *> &V) {
2145   // Collect inner loops and outer loops without irreducible control flow. For
2146   // now, only collect outer loops that have explicit vectorization hints. If we
2147   // are stress testing the VPlan H-CFG construction, we collect the outermost
2148   // loop of every loop nest.
2149   if (L.isInnermost() || VPlanBuildStressTest ||
2150       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2151     LoopBlocksRPO RPOT(&L);
2152     RPOT.perform(LI);
2153     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2154       V.push_back(&L);
2155       // TODO: Collect inner loops inside marked outer loops in case
2156       // vectorization fails for the outer loop. Do not invoke
2157       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2158       // already known to be reducible. We can use an inherited attribute for
2159       // that.
2160       return;
2161     }
2162   }
2163   for (Loop *InnerL : L)
2164     collectSupportedLoops(*InnerL, LI, ORE, V);
2165 }
2166 
2167 namespace {
2168 
2169 /// The LoopVectorize Pass.
2170 struct LoopVectorize : public FunctionPass {
2171   /// Pass identification, replacement for typeid
2172   static char ID;
2173 
2174   LoopVectorizePass Impl;
2175 
2176   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2177                          bool VectorizeOnlyWhenForced = false)
2178       : FunctionPass(ID),
2179         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2180     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2181   }
2182 
2183   bool runOnFunction(Function &F) override {
2184     if (skipFunction(F))
2185       return false;
2186 
2187     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2188     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2189     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2190     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2191     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2192     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2193     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2194     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2195     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2196     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2197     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2198     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2199     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2200 
2201     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2202         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2203 
2204     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2205                         GetLAA, *ORE, PSI).MadeAnyChange;
2206   }
2207 
2208   void getAnalysisUsage(AnalysisUsage &AU) const override {
2209     AU.addRequired<AssumptionCacheTracker>();
2210     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2211     AU.addRequired<DominatorTreeWrapperPass>();
2212     AU.addRequired<LoopInfoWrapperPass>();
2213     AU.addRequired<ScalarEvolutionWrapperPass>();
2214     AU.addRequired<TargetTransformInfoWrapperPass>();
2215     AU.addRequired<AAResultsWrapperPass>();
2216     AU.addRequired<LoopAccessLegacyAnalysis>();
2217     AU.addRequired<DemandedBitsWrapperPass>();
2218     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2219     AU.addRequired<InjectTLIMappingsLegacy>();
2220 
2221     // We currently do not preserve loopinfo/dominator analyses with outer loop
2222     // vectorization. Until this is addressed, mark these analyses as preserved
2223     // only for non-VPlan-native path.
2224     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2225     if (!EnableVPlanNativePath) {
2226       AU.addPreserved<LoopInfoWrapperPass>();
2227       AU.addPreserved<DominatorTreeWrapperPass>();
2228     }
2229 
2230     AU.addPreserved<BasicAAWrapperPass>();
2231     AU.addPreserved<GlobalsAAWrapperPass>();
2232     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2233   }
2234 };
2235 
2236 } // end anonymous namespace
2237 
2238 //===----------------------------------------------------------------------===//
2239 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2240 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2241 //===----------------------------------------------------------------------===//
2242 
2243 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2247   Instruction *Instr = dyn_cast<Instruction>(V);
2248   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2249                      (!Instr ||
2250                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2251   // Place the code for broadcasting invariant variables in the new preheader.
2252   IRBuilder<>::InsertPointGuard Guard(Builder);
2253   if (SafeToHoist)
2254     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2255 
2256   // Broadcast the scalar into all locations in the vector.
2257   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2258 
2259   return Shuf;
2260 }
2261 
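// Illustrative example: for an i32 induction with start 0, step 1, VF = 4 and
// UF = 2, the code below creates a vector phi that starts at <0, 1, 2, 3>;
// each unrolled part then adds the splat of VF * Step (<4, 4, 4, 4>), and the
// last of those additions is moved to the latch and fed back into the phi.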
2262 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2263     const InductionDescriptor &II, Value *Step, Value *Start,
2264     Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2265     VPTransformState &State) {
2266   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2267          "Expected either an induction phi-node or a truncate of it!");
2268 
  // Construct the initial value of the vector IV in the vector loop preheader.
2270   auto CurrIP = Builder.saveIP();
2271   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2272   if (isa<TruncInst>(EntryVal)) {
2273     assert(Start->getType()->isIntegerTy() &&
2274            "Truncation requires an integer type");
2275     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2276     Step = Builder.CreateTrunc(Step, TruncType);
2277     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2278   }
2279   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2280   Value *SteppedStart =
2281       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2282 
2283   // We create vector phi nodes for both integer and floating-point induction
2284   // variables. Here, we determine the kind of arithmetic we will perform.
2285   Instruction::BinaryOps AddOp;
2286   Instruction::BinaryOps MulOp;
2287   if (Step->getType()->isIntegerTy()) {
2288     AddOp = Instruction::Add;
2289     MulOp = Instruction::Mul;
2290   } else {
2291     AddOp = II.getInductionOpcode();
2292     MulOp = Instruction::FMul;
2293   }
2294 
2295   // Multiply the vectorization factor by the step using integer or
2296   // floating-point arithmetic as appropriate.
2297   Type *StepType = Step->getType();
2298   if (Step->getType()->isFloatingPointTy())
2299     StepType = IntegerType::get(StepType->getContext(),
2300                                 StepType->getScalarSizeInBits());
2301   Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2302   if (Step->getType()->isFloatingPointTy())
2303     RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2304   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2305 
2306   // Create a vector splat to use in the induction update.
2307   //
2308   // FIXME: If the step is non-constant, we create the vector splat with
2309   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2310   //        handle a constant vector splat.
2311   Value *SplatVF = isa<Constant>(Mul)
2312                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2313                        : Builder.CreateVectorSplat(VF, Mul);
2314   Builder.restoreIP(CurrIP);
2315 
2316   // We may need to add the step a number of times, depending on the unroll
2317   // factor. The last of those goes into the PHI.
2318   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2319                                     &*LoopVectorBody->getFirstInsertionPt());
2320   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2321   Instruction *LastInduction = VecInd;
2322   for (unsigned Part = 0; Part < UF; ++Part) {
2323     State.set(Def, LastInduction, Part);
2324 
2325     if (isa<TruncInst>(EntryVal))
2326       addMetadata(LastInduction, EntryVal);
2327     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2328                                           State, Part);
2329 
2330     LastInduction = cast<Instruction>(
2331         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2332     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2333   }
2334 
2335   // Move the last step to the end of the latch block. This ensures consistent
2336   // placement of all induction updates.
2337   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2338   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2339   auto *ICmp = cast<Instruction>(Br->getCondition());
2340   LastInduction->moveBefore(ICmp);
2341   LastInduction->setName("vec.ind.next");
2342 
2343   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2344   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2345 }
2346 
2347 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2348   return Cost->isScalarAfterVectorization(I, VF) ||
2349          Cost->isProfitableToScalarize(I, VF);
2350 }
2351 
2352 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2353   if (shouldScalarizeInstruction(IV))
2354     return true;
2355   auto isScalarInst = [&](User *U) -> bool {
2356     auto *I = cast<Instruction>(U);
2357     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2358   };
2359   return llvm::any_of(IV->users(), isScalarInst);
2360 }
2361 
2362 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2363     const InductionDescriptor &ID, const Instruction *EntryVal,
2364     Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2365     unsigned Part, unsigned Lane) {
2366   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2367          "Expected either an induction phi-node or a truncate of it!");
2368 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2375   if (isa<TruncInst>(EntryVal))
2376     return;
2377 
2378   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2379   if (Casts.empty())
2380     return;
  // Only the first Cast instruction in the Casts vector is of interest. The
  // remaining Casts (if any) have no uses outside the induction update chain
  // itself.
2384   if (Lane < UINT_MAX)
2385     State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2386   else
2387     State.set(CastDef, VectorLoopVal, Part);
2388 }
2389 
2390 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2391                                                 TruncInst *Trunc, VPValue *Def,
2392                                                 VPValue *CastDef,
2393                                                 VPTransformState &State) {
2394   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2395          "Primary induction variable must have an integer type");
2396 
2397   auto II = Legal->getInductionVars().find(IV);
2398   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2399 
2400   auto ID = II->second;
2401   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2402 
2403   // The value from the original loop to which we are mapping the new induction
2404   // variable.
2405   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2406 
2407   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2408 
2409   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2411   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2412     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2413            "Induction step should be loop invariant");
2414     if (PSE.getSE()->isSCEVable(IV->getType())) {
2415       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2416       return Exp.expandCodeFor(Step, Step->getType(),
2417                                LoopVectorPreHeader->getTerminator());
2418     }
2419     return cast<SCEVUnknown>(Step)->getValue();
2420   };
2421 
2422   // The scalar value to broadcast. This is derived from the canonical
2423   // induction variable. If a truncation type is given, truncate the canonical
2424   // induction variable and step. Otherwise, derive these values from the
2425   // induction descriptor.
2426   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2427     Value *ScalarIV = Induction;
2428     if (IV != OldInduction) {
2429       ScalarIV = IV->getType()->isIntegerTy()
2430                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2431                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2432                                           IV->getType());
2433       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2434       ScalarIV->setName("offset.idx");
2435     }
2436     if (Trunc) {
2437       auto *TruncType = cast<IntegerType>(Trunc->getType());
2438       assert(Step->getType()->isIntegerTy() &&
2439              "Truncation requires an integer step");
2440       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2441       Step = Builder.CreateTrunc(Step, TruncType);
2442     }
2443     return ScalarIV;
2444   };
2445 
  // Create the vector values from the scalar IV, for the case where we do not
  // create a vector IV.
2448   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2449     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2450     for (unsigned Part = 0; Part < UF; ++Part) {
2451       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2452       Value *EntryPart =
2453           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2454                         ID.getInductionOpcode());
2455       State.set(Def, EntryPart, Part);
2456       if (Trunc)
2457         addMetadata(EntryPart, Trunc);
2458       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2459                                             State, Part);
2460     }
2461   };
2462 
2463   // Fast-math-flags propagate from the original induction instruction.
2464   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2465   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2466     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2467 
2468   // Now do the actual transformations, and start with creating the step value.
2469   Value *Step = CreateStepValue(ID.getStep());
2470   if (VF.isZero() || VF.isScalar()) {
2471     Value *ScalarIV = CreateScalarIV(Step);
2472     CreateSplatIV(ScalarIV, Step);
2473     return;
2474   }
2475 
2476   // Determine if we want a scalar version of the induction variable. This is
2477   // true if the induction variable itself is not widened, or if it has at
2478   // least one user in the loop that is not widened.
2479   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2480   if (!NeedsScalarIV) {
2481     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2482                                     State);
2483     return;
2484   }
2485 
2486   // Try to create a new independent vector induction variable. If we can't
2487   // create the phi node, we will splat the scalar induction variable in each
2488   // loop iteration.
2489   if (!shouldScalarizeInstruction(EntryVal)) {
2490     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2491                                     State);
2492     Value *ScalarIV = CreateScalarIV(Step);
2493     // Create scalar steps that can be used by instructions we will later
2494     // scalarize. Note that the addition of the scalar steps will not increase
2495     // the number of instructions in the loop in the common case prior to
2496     // InstCombine. We will be trading one vector extract for each scalar step.
2497     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2498     return;
2499   }
2500 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. The exception is when we fold the tail by masking, in which
  // case the splat IV feeds the predicate used by the masked loads/stores.
2504   Value *ScalarIV = CreateScalarIV(Step);
2505   if (!Cost->isScalarEpilogueAllowed())
2506     CreateSplatIV(ScalarIV, Step);
2507   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2508 }
2509 
2510 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2511                                           Instruction::BinaryOps BinOp) {
2512   // Create and check the types.
2513   auto *ValVTy = cast<VectorType>(Val->getType());
2514   ElementCount VLen = ValVTy->getElementCount();
2515 
2516   Type *STy = Val->getType()->getScalarType();
2517   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2518          "Induction Step must be an integer or FP");
2519   assert(Step->getType() == STy && "Step has wrong type");
2520 
2521   SmallVector<Constant *, 8> Indices;
2522 
2523   // Create a vector of consecutive numbers from zero to VF.
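  // For example, for a fixed VF of 4 this is the constant <0, 1, 2, 3>; for
  // scalable VFs, CreateStepVector emits a stepvector intrinsic instead.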
2524   VectorType *InitVecValVTy = ValVTy;
2525   Type *InitVecValSTy = STy;
2526   if (STy->isFloatingPointTy()) {
2527     InitVecValSTy =
2528         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2529     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2530   }
2531   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2532 
2533   // Add on StartIdx
2534   Value *StartIdxSplat = Builder.CreateVectorSplat(
2535       VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2536   InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2537 
2538   if (STy->isIntegerTy()) {
2539     Step = Builder.CreateVectorSplat(VLen, Step);
2540     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2543     Step = Builder.CreateMul(InitVec, Step);
2544     return Builder.CreateAdd(Val, Step, "induction");
2545   }
2546 
2547   // Floating point induction.
2548   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2549          "Binary Opcode should be specified for FP induction");
2550   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2551   Step = Builder.CreateVectorSplat(VLen, Step);
2552   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2553   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2554 }
2555 
2556 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2557                                            Instruction *EntryVal,
2558                                            const InductionDescriptor &ID,
2559                                            VPValue *Def, VPValue *CastDef,
2560                                            VPTransformState &State) {
2561   // We shouldn't have to build scalar steps if we aren't vectorizing.
2562   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2564   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2565   assert(ScalarIVTy == Step->getType() &&
2566          "Val and Step should have the same type");
2567 
2568   // We build scalar steps for both integer and floating-point induction
2569   // variables. Here, we determine the kind of arithmetic we will perform.
2570   Instruction::BinaryOps AddOp;
2571   Instruction::BinaryOps MulOp;
2572   if (ScalarIVTy->isIntegerTy()) {
2573     AddOp = Instruction::Add;
2574     MulOp = Instruction::Mul;
2575   } else {
2576     AddOp = ID.getInductionOpcode();
2577     MulOp = Instruction::FMul;
2578   }
2579 
2580   // Determine the number of scalars we need to generate for each unroll
2581   // iteration. If EntryVal is uniform, we only need to generate the first
2582   // lane. Otherwise, we generate all VF values.
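  // For example, with UF = 2 and a fixed VF of 4, a uniform EntryVal gets one
  // value per part (2 in total), while a non-uniform one gets 2 * 4 scalar
  // steps.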
2583   bool IsUniform =
2584       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2585   unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2586   // Compute the scalar steps and save the results in State.
2587   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2588                                      ScalarIVTy->getScalarSizeInBits());
2589   Type *VecIVTy = nullptr;
2590   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2591   if (!IsUniform && VF.isScalable()) {
2592     VecIVTy = VectorType::get(ScalarIVTy, VF);
2593     UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2594     SplatStep = Builder.CreateVectorSplat(VF, Step);
2595     SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2596   }
2597 
2598   for (unsigned Part = 0; Part < UF; ++Part) {
2599     Value *StartIdx0 =
2600         createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2601 
2602     if (!IsUniform && VF.isScalable()) {
2603       auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2604       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2605       if (ScalarIVTy->isFloatingPointTy())
2606         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2607       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2608       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2609       State.set(Def, Add, Part);
2610       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2611                                             Part);
2612       // It's useful to record the lane values too for the known minimum number
2613       // of elements so we do those below. This improves the code quality when
2614       // trying to extract the first element, for example.
2615     }
2616 
2617     if (ScalarIVTy->isFloatingPointTy())
2618       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2619 
2620     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2621       Value *StartIdx = Builder.CreateBinOp(
2622           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2623       // The step returned by `createStepForVF` is a runtime-evaluated value
2624       // when VF is scalable. Otherwise, it should be folded into a Constant.
2625       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2626              "Expected StartIdx to be folded to a constant when VF is not "
2627              "scalable");
2628       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2629       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2630       State.set(Def, Add, VPIteration(Part, Lane));
2631       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2632                                             Part, Lane);
2633     }
2634   }
2635 }
2636 
2637 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2638                                                     const VPIteration &Instance,
2639                                                     VPTransformState &State) {
2640   Value *ScalarInst = State.get(Def, Instance);
2641   Value *VectorValue = State.get(Def, Instance.Part);
2642   VectorValue = Builder.CreateInsertElement(
2643       VectorValue, ScalarInst,
2644       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2645   State.set(Def, VectorValue, Instance.Part);
2646 }
2647 
2648 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2649   assert(Vec->getType()->isVectorTy() && "Invalid type");
2650   return Builder.CreateVectorReverse(Vec, "reverse");
2651 }
2652 
2653 // Return whether we allow using masked interleave-groups (for dealing with
2654 // strided loads/stores that reside in predicated blocks, or for dealing
2655 // with gaps).
2656 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2657   // If an override option has been passed in for interleaved accesses, use it.
2658   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2659     return EnableMaskedInterleavedMemAccesses;
2660 
2661   return TTI.enableMaskedInterleavedAccessVectorization();
2662 }
2663 
2664 // Try to vectorize the interleave group that \p Instr belongs to.
2665 //
2666 // E.g. Translate following interleaved load group (factor = 3):
2667 //   for (i = 0; i < N; i+=3) {
2668 //     R = Pic[i];             // Member of index 0
2669 //     G = Pic[i+1];           // Member of index 1
2670 //     B = Pic[i+2];           // Member of index 2
2671 //     ... // do something to R, G, B
2672 //   }
2673 // To:
2674 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2675 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2676 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2677 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2678 //
2679 // Or translate following interleaved store group (factor = 3):
2680 //   for (i = 0; i < N; i+=3) {
2681 //     ... do something to R, G, B
2682 //     Pic[i]   = R;           // Member of index 0
2683 //     Pic[i+1] = G;           // Member of index 1
2684 //     Pic[i+2] = B;           // Member of index 2
2685 //   }
2686 // To:
2687 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2688 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2689 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2690 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2691 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2692 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2693     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2694     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2695     VPValue *BlockInMask) {
2696   Instruction *Instr = Group->getInsertPos();
2697   const DataLayout &DL = Instr->getModule()->getDataLayout();
2698 
  // Prepare the vector type of the interleaved load/store.
2700   Type *ScalarTy = getLoadStoreType(Instr);
2701   unsigned InterleaveFactor = Group->getFactor();
2702   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2703   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2704 
  // Prepare the new pointers.
2706   SmallVector<Value *, 2> AddrParts;
2707   unsigned Index = Group->getIndex(Instr);
2708 
2709   // TODO: extend the masked interleaved-group support to reversed access.
2710   assert((!BlockInMask || !Group->isReverse()) &&
2711          "Reversed masked interleave-group not supported.");
2712 
2713   // If the group is reverse, adjust the index to refer to the last vector lane
2714   // instead of the first. We adjust the index from the first vector lane,
2715   // rather than directly getting the pointer for lane VF - 1, because the
2716   // pointer operand of the interleaved access is supposed to be uniform. For
2717   // uniform instructions, we're only required to generate a value for the
2718   // first vector lane in each unroll iteration.
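  // For example, for a fixed VF of 4 and an interleave factor of 3, the index
  // is increased by (4 - 1) * 3 = 9 so that the negated offset applied below
  // makes the wide access start at the lowest-addressed tuple of this part.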
2719   if (Group->isReverse())
2720     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2721 
2722   for (unsigned Part = 0; Part < UF; Part++) {
2723     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2724     setDebugLocFromInst(AddrPart);
2725 
    // Note that the current instruction could be at any member index. We need
    // to adjust the address so that it refers to the member of index 0.
2728     //
2729     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2730     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2732     //
2733     // E.g.  A[i+1] = a;     // Member of index 1
2734     //       A[i]   = b;     // Member of index 0
2735     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2737 
2738     bool InBounds = false;
2739     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2740       InBounds = gep->isInBounds();
2741     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2742     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2743 
2744     // Cast to the vector pointer type.
2745     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2746     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2747     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2748   }
2749 
2750   setDebugLocFromInst(Instr);
2751   Value *PoisonVec = PoisonValue::get(VecTy);
2752 
2753   Value *MaskForGaps = nullptr;
2754   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2755     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2756     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2757   }
2758 
2759   // Vectorize the interleaved load group.
2760   if (isa<LoadInst>(Instr)) {
2761     // For each unroll part, create a wide load for the group.
2762     SmallVector<Value *, 2> NewLoads;
2763     for (unsigned Part = 0; Part < UF; Part++) {
2764       Instruction *NewLoad;
2765       if (BlockInMask || MaskForGaps) {
2766         assert(useMaskedInterleavedAccesses(*TTI) &&
2767                "masked interleaved groups are not allowed.");
2768         Value *GroupMask = MaskForGaps;
2769         if (BlockInMask) {
2770           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2771           Value *ShuffledMask = Builder.CreateShuffleVector(
2772               BlockInMaskPart,
2773               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2774               "interleaved.mask");
2775           GroupMask = MaskForGaps
2776                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2777                                                 MaskForGaps)
2778                           : ShuffledMask;
2779         }
2780         NewLoad =
2781             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2782                                      GroupMask, PoisonVec, "wide.masked.vec");
      } else
2785         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2786                                             Group->getAlign(), "wide.vec");
2787       Group->addMetadata(NewLoad);
2788       NewLoads.push_back(NewLoad);
2789     }
2790 
2791     // For each member in the group, shuffle out the appropriate data from the
2792     // wide loads.
2793     unsigned J = 0;
2794     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2795       Instruction *Member = Group->getMember(I);
2796 
2797       // Skip the gaps in the group.
2798       if (!Member)
2799         continue;
2800 
2801       auto StrideMask =
2802           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2803       for (unsigned Part = 0; Part < UF; Part++) {
2804         Value *StridedVec = Builder.CreateShuffleVector(
2805             NewLoads[Part], StrideMask, "strided.vec");
2806 
        // If this member has a different type, cast the result type.
2808         if (Member->getType() != ScalarTy) {
2809           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2810           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2811           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2812         }
2813 
2814         if (Group->isReverse())
2815           StridedVec = reverseVector(StridedVec);
2816 
2817         State.set(VPDefs[J], StridedVec, Part);
2818       }
2819       ++J;
2820     }
2821     return;
2822   }
2823 
  // The sub vector type for the current instruction.
2825   auto *SubVT = VectorType::get(ScalarTy, VF);
2826 
2827   // Vectorize the interleaved store group.
2828   for (unsigned Part = 0; Part < UF; Part++) {
2829     // Collect the stored vector from each member.
2830     SmallVector<Value *, 4> StoredVecs;
2831     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2834 
2835       Value *StoredVec = State.get(StoredValues[i], Part);
2836 
2837       if (Group->isReverse())
2838         StoredVec = reverseVector(StoredVec);
2839 
      // If this member has a different type, cast it to a unified type.
2842       if (StoredVec->getType() != SubVT)
2843         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2844 
2845       StoredVecs.push_back(StoredVec);
2846     }
2847 
2848     // Concatenate all vectors into a wide vector.
2849     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2850 
2851     // Interleave the elements in the wide vector.
2852     Value *IVec = Builder.CreateShuffleVector(
2853         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2854         "interleaved.vec");
2855 
2856     Instruction *NewStoreInstr;
2857     if (BlockInMask) {
2858       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2859       Value *ShuffledMask = Builder.CreateShuffleVector(
2860           BlockInMaskPart,
2861           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2862           "interleaved.mask");
2863       NewStoreInstr = Builder.CreateMaskedStore(
2864           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2867       NewStoreInstr =
2868           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2869 
2870     Group->addMetadata(NewStoreInstr);
2871   }
2872 }
2873 
2874 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2875     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2876     VPValue *StoredValue, VPValue *BlockInMask) {
2877   // Attempt to issue a wide load.
2878   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2879   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2880 
2881   assert((LI || SI) && "Invalid Load/Store instruction");
2882   assert((!SI || StoredValue) && "No stored value provided for widened store");
2883   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2884 
2885   LoopVectorizationCostModel::InstWidening Decision =
2886       Cost->getWideningDecision(Instr, VF);
2887   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2888           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2889           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2890          "CM decision is not to widen the memory instruction");
2891 
2892   Type *ScalarDataTy = getLoadStoreType(Instr);
2893 
2894   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2895   const Align Alignment = getLoadStoreAlignment(Instr);
2896 
2897   // Determine if the pointer operand of the access is either consecutive or
2898   // reverse consecutive.
2899   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2900   bool ConsecutiveStride =
2901       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2902   bool CreateGatherScatter =
2903       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2904 
2905   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2906   // gather/scatter. Otherwise Decision should have been to Scalarize.
2907   assert((ConsecutiveStride || CreateGatherScatter) &&
2908          "The instruction should be scalarized");
2909   (void)ConsecutiveStride;
2910 
2911   VectorParts BlockInMaskParts(UF);
2912   bool isMaskRequired = BlockInMask;
2913   if (isMaskRequired)
2914     for (unsigned Part = 0; Part < UF; ++Part)
2915       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2916 
2917   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2918     // Calculate the pointer for the specific unroll-part.
2919     GetElementPtrInst *PartPtr = nullptr;
2920 
2921     bool InBounds = false;
2922     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2923       InBounds = gep->isInBounds();
2924     if (Reverse) {
2925       // If the address is consecutive but reversed, then the
2926       // wide store needs to start at the last vector element.
2927       // RunTimeVF =  VScale * VF.getKnownMinValue()
2928       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
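      // For example, for a fixed VF of 4 and Part = 1: NumElt = -4 and
      // LastLane = -3, so PartPtr = Ptr - 7, the lowest address covered by
      // this part's reversed access.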
2929       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2930       // NumElt = -Part * RunTimeVF
2931       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2932       // LastLane = 1 - RunTimeVF
2933       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2934       PartPtr =
2935           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2936       PartPtr->setIsInBounds(InBounds);
2937       PartPtr = cast<GetElementPtrInst>(
2938           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2939       PartPtr->setIsInBounds(InBounds);
2940       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2941         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2942     } else {
2943       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2944       PartPtr = cast<GetElementPtrInst>(
2945           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2946       PartPtr->setIsInBounds(InBounds);
2947     }
2948 
2949     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2950     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2951   };
2952 
2953   // Handle Stores:
2954   if (SI) {
2955     setDebugLocFromInst(SI);
2956 
2957     for (unsigned Part = 0; Part < UF; ++Part) {
2958       Instruction *NewSI = nullptr;
2959       Value *StoredVal = State.get(StoredValue, Part);
2960       if (CreateGatherScatter) {
2961         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2962         Value *VectorGep = State.get(Addr, Part);
2963         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2964                                             MaskPart);
2965       } else {
2966         if (Reverse) {
2967           // If we store to reverse consecutive memory locations, then we need
2968           // to reverse the order of elements in the stored value.
2969           StoredVal = reverseVector(StoredVal);
2970           // We don't want to update the value in the map as it might be used in
2971           // another expression. So don't call resetVectorValue(StoredVal).
2972         }
2973         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2974         if (isMaskRequired)
2975           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2976                                             BlockInMaskParts[Part]);
2977         else
2978           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2979       }
2980       addMetadata(NewSI, SI);
2981     }
2982     return;
2983   }
2984 
2985   // Handle loads.
2986   assert(LI && "Must have a load instruction");
2987   setDebugLocFromInst(LI);
2988   for (unsigned Part = 0; Part < UF; ++Part) {
2989     Value *NewLI;
2990     if (CreateGatherScatter) {
2991       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2992       Value *VectorGep = State.get(Addr, Part);
2993       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2994                                          nullptr, "wide.masked.gather");
2995       addMetadata(NewLI, LI);
2996     } else {
2997       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2998       if (isMaskRequired)
2999         NewLI = Builder.CreateMaskedLoad(
3000             VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
3001             "wide.masked.load");
3002       else
3003         NewLI =
3004             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
3005 
      // Add metadata to the load, but record the reverse shuffle as the
      // vector value.
3007       addMetadata(NewLI, LI);
3008       if (Reverse)
3009         NewLI = reverseVector(NewLI);
3010     }
3011 
3012     State.set(Def, NewLI, Part);
3013   }
3014 }
3015 
3016 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3017                                                VPUser &User,
3018                                                const VPIteration &Instance,
3019                                                bool IfPredicateInstr,
3020                                                VPTransformState &State) {
3021   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3022 
3023   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3024   // the first lane and part.
3025   if (isa<NoAliasScopeDeclInst>(Instr))
3026     if (!Instance.isFirstIteration())
3027       return;
3028 
3029   setDebugLocFromInst(Instr);
3030 
  // Does this instruction return a value?
3032   bool IsVoidRetTy = Instr->getType()->isVoidTy();
3033 
3034   Instruction *Cloned = Instr->clone();
3035   if (!IsVoidRetTy)
3036     Cloned->setName(Instr->getName() + ".cloned");
3037 
3038   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3039                                Builder.GetInsertPoint());
3040   // Replace the operands of the cloned instructions with their scalar
3041   // equivalents in the new loop.
3042   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3043     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3044     auto InputInstance = Instance;
3045     if (!Operand || !OrigLoop->contains(Operand) ||
3046         (Cost->isUniformAfterVectorization(Operand, State.VF)))
3047       InputInstance.Lane = VPLane::getFirstLane();
3048     auto *NewOp = State.get(User.getOperand(op), InputInstance);
3049     Cloned->setOperand(op, NewOp);
3050   }
3051   addNewMetadata(Cloned, Instr);
3052 
3053   // Place the cloned scalar in the new loop.
3054   Builder.Insert(Cloned);
3055 
3056   State.set(Def, Cloned, Instance);
3057 
  // If we just cloned a new assumption, add it to the assumption cache.
3059   if (auto *II = dyn_cast<AssumeInst>(Cloned))
3060     AC->registerAssumption(II);
3061 
3062   // End if-block.
3063   if (IfPredicateInstr)
3064     PredicatedInstructions.push_back(Cloned);
3065 }
3066 
3067 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3068                                                       Value *End, Value *Step,
3069                                                       Instruction *DL) {
3070   BasicBlock *Header = L->getHeader();
3071   BasicBlock *Latch = L->getLoopLatch();
3072   // As we're just creating this loop, it's possible no latch exists
3073   // yet. If so, use the header as this will be a single block loop.
3074   if (!Latch)
3075     Latch = Header;
3076 
3077   IRBuilder<> B(&*Header->getFirstInsertionPt());
3078   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3079   setDebugLocFromInst(OldInst, &B);
3080   auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3081 
3082   B.SetInsertPoint(Latch->getTerminator());
3083   setDebugLocFromInst(OldInst, &B);
3084 
3085   // Create i+1 and fill the PHINode.
3086   //
3087   // If the tail is not folded, we know that End - Start >= Step (either
3088   // statically or through the minimum iteration checks). We also know that both
3089   // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3090   // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3091   // overflows and we can mark the induction increment as NUW.
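  // For example, with Start = 0, End = 64 and Step = 16, the IV takes the
  // values 0, 16, 32, 48 and the loop exits once %IV + %Step == 64, so the
  // increment never wraps.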
3092   Value *Next = B.CreateAdd(Induction, Step, "index.next",
3093                             /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3094   Induction->addIncoming(Start, L->getLoopPreheader());
3095   Induction->addIncoming(Next, Latch);
3096   // Create the compare.
3097   Value *ICmp = B.CreateICmpEQ(Next, End);
3098   B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3099 
3100   // Now we have two terminators. Remove the old one from the block.
3101   Latch->getTerminator()->eraseFromParent();
3102 
3103   return Induction;
3104 }
3105 
3106 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3107   if (TripCount)
3108     return TripCount;
3109 
3110   assert(L && "Create Trip Count for null loop.");
3111   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3112   // Find the loop boundaries.
3113   ScalarEvolution *SE = PSE.getSE();
3114   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3115   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3116          "Invalid loop count");
3117 
3118   Type *IdxTy = Legal->getWidestInductionType();
3119   assert(IdxTy && "No type for induction");
3120 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we get a backedge-taken count in that case is if the
  // induction variable was signed and as such will not overflow, so the
  // truncation is legal.
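  // For example, an i32 IV that is sign-extended to i64 for the exit compare
  // can produce an i64 backedge-taken count that is safely truncated back to
  // i32 here.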
3126   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3127       IdxTy->getPrimitiveSizeInBits())
3128     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3129   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3130 
3131   // Get the total trip count from the count by adding 1.
3132   const SCEV *ExitCount = SE->getAddExpr(
3133       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3134 
3135   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3136 
3137   // Expand the trip count and place the new instructions in the preheader.
3138   // Notice that the pre-header does not change, only the loop body.
3139   SCEVExpander Exp(*SE, DL, "induction");
3140 
3141   // Count holds the overall loop count (N).
3142   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3143                                 L->getLoopPreheader()->getTerminator());
3144 
3145   if (TripCount->getType()->isPointerTy())
3146     TripCount =
3147         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3148                                     L->getLoopPreheader()->getTerminator());
3149 
3150   return TripCount;
3151 }
3152 
3153 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3154   if (VectorTripCount)
3155     return VectorTripCount;
3156 
3157   Value *TC = getOrCreateTripCount(L);
3158   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3159 
3160   Type *Ty = TC->getType();
3161   // This is where we can make the step a runtime constant.
3162   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3163 
3164   // If the tail is to be folded by masking, round the number of iterations N
3165   // up to a multiple of Step instead of rounding down. This is done by first
3166   // adding Step-1 and then rounding down. Note that it's ok if this addition
3167   // overflows: the vector induction variable will eventually wrap to zero given
3168   // that it starts at zero and its Step is a power of two; the loop will then
3169   // exit, with the last early-exit vector comparison also producing all-true.
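  // For example, with VF * UF = 8 and N = 13: TC becomes 13 + 7 = 20, and the
  // computation below yields a vector trip count of 20 - (20 % 8) = 16, so the
  // 3-element tail is handled under the mask.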
3170   if (Cost->foldTailByMasking()) {
3171     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3172            "VF*UF must be a power of 2 when folding tail by masking");
3173     assert(!VF.isScalable() &&
3174            "Tail folding not yet supported for scalable vectors");
3175     TC = Builder.CreateAdd(
3176         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3177   }
3178 
3179   // Now we need to generate the expression for the part of the loop that the
3180   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3181   // iterations are not required for correctness, or N - Step, otherwise. Step
3182   // is equal to the vectorization factor (number of SIMD elements) times the
3183   // unroll factor (number of SIMD instructions).
3184   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3185 
3186   // There are cases where we *must* run at least one iteration in the remainder
3187   // loop.  See the cost model for when this can happen.  If the step evenly
3188   // divides the trip count, we set the remainder to be equal to the step. If
3189   // the step does not evenly divide the trip count, no adjustment is necessary
3190   // since there will already be scalar iterations. Note that the minimum
3191   // iterations check ensures that N >= Step.
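  // For example, with N = 16 and Step = 8, R would be 0; it is bumped to 8, so
  // the vector trip count becomes 8 and the remaining 8 iterations run in the
  // scalar epilogue loop.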
3192   if (Cost->requiresScalarEpilogue(VF)) {
3193     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3194     R = Builder.CreateSelect(IsZero, Step, R);
3195   }
3196 
3197   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3198 
3199   return VectorTripCount;
3200 }
3201 
3202 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3203                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same element count as DstVTy.
3205   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3206   unsigned VF = DstFVTy->getNumElements();
3207   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3209   Type *SrcElemTy = SrcVecTy->getElementType();
3210   Type *DstElemTy = DstFVTy->getElementType();
3211   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3212          "Vector elements must have same size");
3213 
3214   // Do a direct cast if element types are castable.
3215   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3216     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3217   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this using a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
3222   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3223          "Only one type should be a pointer type");
3224   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3225          "Only one type should be a floating point type");
3226   Type *IntTy =
3227       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3228   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3229   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3230   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3231 }
3232 
3233 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3234                                                          BasicBlock *Bypass) {
3235   Value *Count = getOrCreateTripCount(L);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
3238   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3239   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3240 
3241   // Generate code to check if the loop's trip count is less than VF * UF, or
3242   // equal to it in case a scalar epilogue is required; this implies that the
3243   // vector trip count is zero. This check also covers the case where adding one
3244   // to the backedge-taken count overflowed leading to an incorrect trip count
3245   // of zero. In this case we will also jump to the scalar loop.
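  // For example, if the backedge-taken count equals the maximum value of the
  // IV type, adding one wraps Count to zero and the unsigned compare below
  // branches to the scalar loop.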
3246   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3247                                             : ICmpInst::ICMP_ULT;
3248 
3249   // If tail is to be folded, vector loop takes care of all iterations.
3250   Value *CheckMinIters = Builder.getFalse();
3251   if (!Cost->foldTailByMasking()) {
3252     Value *Step =
3253         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3254     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3255   }
3256   // Create new preheader for vector loop.
3257   LoopVectorPreHeader =
3258       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3259                  "vector.ph");
3260 
3261   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3262                                DT->getNode(Bypass)->getIDom()) &&
3263          "TC check is expected to dominate Bypass");
3264 
3265   // Update dominator for Bypass & LoopExit.
3266   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3267   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3268 
3269   ReplaceInstWithInst(
3270       TCCheckBlock->getTerminator(),
3271       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3272   LoopBypassBlocks.push_back(TCCheckBlock);
3273 }
3274 
3275 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3277   BasicBlock *const SCEVCheckBlock =
3278       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3279   if (!SCEVCheckBlock)
3280     return nullptr;
3281 
3282   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3283            (OptForSizeBasedOnProfile &&
3284             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3285          "Cannot SCEV check stride or overflow when optimizing for size");
3286 
  // Update dominator only if this is the first RT check.
3289   if (LoopBypassBlocks.empty()) {
3290     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3291     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3292   }
3293 
3294   LoopBypassBlocks.push_back(SCEVCheckBlock);
3295   AddedSafetyChecks = true;
3296   return SCEVCheckBlock;
3297 }
3298 
3299 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3300                                                       BasicBlock *Bypass) {
3301   // VPlan-native path does not do any analysis for runtime checks currently.
3302   if (EnableVPlanNativePath)
3303     return nullptr;
3304 
3305   BasicBlock *const MemCheckBlock =
3306       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3307 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3311   if (!MemCheckBlock)
3312     return nullptr;
3313 
3314   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3315     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3316            "Cannot emit memory checks when optimizing for size, unless forced "
3317            "to vectorize.");
3318     ORE->emit([&]() {
3319       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3320                                         L->getStartLoc(), L->getHeader())
3321              << "Code-size may be reduced by not forcing "
3322                 "vectorization, or by source-code modifications "
3323                 "eliminating the need for runtime checks "
3324                 "(e.g., adding 'restrict').";
3325     });
3326   }
3327 
3328   LoopBypassBlocks.push_back(MemCheckBlock);
3329 
3330   AddedSafetyChecks = true;
3331 
3332   // We currently don't use LoopVersioning for the actual loop cloning but we
3333   // still use it to add the noalias metadata.
3334   LVer = std::make_unique<LoopVersioning>(
3335       *Legal->getLAI(),
3336       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3337       DT, PSE.getSE());
3338   LVer->prepareNoAliasMetadata();
3339   return MemCheckBlock;
3340 }
3341 
3342 Value *InnerLoopVectorizer::emitTransformedIndex(
3343     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3344     const InductionDescriptor &ID) const {
3345 
3346   SCEVExpander Exp(*SE, DL, "induction");
3347   auto Step = ID.getStep();
3348   auto StartValue = ID.getStartValue();
3349   assert(Index->getType()->getScalarType() == Step->getType() &&
3350          "Index scalar type does not match StepValue type");
3351 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3358   auto CreateAdd = [&B](Value *X, Value *Y) {
3359     assert(X->getType() == Y->getType() && "Types don't match!");
3360     if (auto *CX = dyn_cast<ConstantInt>(X))
3361       if (CX->isZero())
3362         return Y;
3363     if (auto *CY = dyn_cast<ConstantInt>(Y))
3364       if (CY->isZero())
3365         return X;
3366     return B.CreateAdd(X, Y);
3367   };
3368 
3369   // We allow X to be a vector type, in which case Y will potentially be
3370   // splatted into a vector with the same element count.
3371   auto CreateMul = [&B](Value *X, Value *Y) {
3372     assert(X->getType()->getScalarType() == Y->getType() &&
3373            "Types don't match!");
3374     if (auto *CX = dyn_cast<ConstantInt>(X))
3375       if (CX->isOne())
3376         return Y;
3377     if (auto *CY = dyn_cast<ConstantInt>(Y))
3378       if (CY->isOne())
3379         return X;
3380     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3381     if (XVTy && !isa<VectorType>(Y->getType()))
3382       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3383     return B.CreateMul(X, Y);
3384   };
3385 
3386   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3387   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3388   // the DomTree is not kept up-to-date for additional blocks generated in the
3389   // vector loop. By using the header as insertion point, we guarantee that the
3390   // expanded instructions dominate all their uses.
3391   auto GetInsertPoint = [this, &B]() {
3392     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3393     if (InsertBB != LoopVectorBody &&
3394         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3395       return LoopVectorBody->getTerminator();
3396     return &*B.GetInsertPoint();
3397   };
3398 
3399   switch (ID.getKind()) {
3400   case InductionDescriptor::IK_IntInduction: {
3401     assert(!isa<VectorType>(Index->getType()) &&
3402            "Vector indices not supported for integer inductions yet");
3403     assert(Index->getType() == StartValue->getType() &&
3404            "Index type does not match StartValue type");
3405     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3406       return B.CreateSub(StartValue, Index);
3407     auto *Offset = CreateMul(
3408         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3409     return CreateAdd(StartValue, Offset);
3410   }
3411   case InductionDescriptor::IK_PtrInduction: {
3412     assert(isa<SCEVConstant>(Step) &&
3413            "Expected constant step for pointer induction");
3414     return B.CreateGEP(
3415         StartValue->getType()->getPointerElementType(), StartValue,
3416         CreateMul(Index,
3417                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3418                                     GetInsertPoint())));
3419   }
3420   case InductionDescriptor::IK_FpInduction: {
3421     assert(!isa<VectorType>(Index->getType()) &&
3422            "Vector indices not supported for FP inductions yet");
3423     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3424     auto InductionBinOp = ID.getInductionBinOp();
3425     assert(InductionBinOp &&
3426            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3427             InductionBinOp->getOpcode() == Instruction::FSub) &&
3428            "Original bin op should be defined for FP induction");
3429 
3430     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3431     Value *MulExp = B.CreateFMul(StepValue, Index);
3432     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3433                          "induction");
3434   }
3435   case InductionDescriptor::IK_NoInduction:
3436     return nullptr;
3437   }
3438   llvm_unreachable("invalid enum");
3439 }
3440 
3441 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3442   LoopScalarBody = OrigLoop->getHeader();
3443   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3444   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3445   assert(LoopExitBlock && "Must have an exit block");
3446   assert(LoopVectorPreHeader && "Invalid loop structure");
3447 
3448   LoopMiddleBlock =
3449       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3450                  LI, nullptr, Twine(Prefix) + "middle.block");
3451   LoopScalarPreHeader =
3452       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3453                  nullptr, Twine(Prefix) + "scalar.ph");
3454 
3455   // Set up branch from middle block to the exit and scalar preheader blocks.
3456   // completeLoopSkeleton will update the condition to use an iteration check,
3457   // if required to decide whether to execute the remainder.
3458   BranchInst *BrInst =
3459       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3460   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3461   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3462   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3463 
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3467   LoopVectorBody =
3468       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3469                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3470 
3471   // Update dominator for loop exit.
3472   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3473 
3474   // Create and register the new vector loop.
3475   Loop *Lp = LI->AllocateLoop();
3476   Loop *ParentLoop = OrigLoop->getParentLoop();
3477 
3478   // Insert the new loop into the loop nest and register the new basic blocks
3479   // before calling any utilities such as SCEV that require valid LoopInfo.
3480   if (ParentLoop) {
3481     ParentLoop->addChildLoop(Lp);
3482   } else {
3483     LI->addTopLevelLoop(Lp);
3484   }
3485   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3486   return Lp;
3487 }
3488 
3489 void InnerLoopVectorizer::createInductionResumeValues(
3490     Loop *L, Value *VectorTripCount,
3491     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3492   assert(VectorTripCount && L && "Expected valid arguments");
3493   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3494           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3495          "Inconsistent information about additional bypass.");
3496   // We are going to resume the execution of the scalar loop.
3497   // Go over all of the induction variables that we found and fix the
3498   // PHIs that are left in the scalar version of the loop.
3499   // The starting values of PHI nodes depend on the counter of the last
3500   // iteration in the vectorized loop.
3501   // If we come from a bypass edge then we need to start from the original
3502   // start value.
3503   for (auto &InductionEntry : Legal->getInductionVars()) {
3504     PHINode *OrigPhi = InductionEntry.first;
3505     InductionDescriptor II = InductionEntry.second;
3506 
    // Create phi nodes to merge from the backedge-taken check block.
3508     PHINode *BCResumeVal =
3509         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3510                         LoopScalarPreHeader->getTerminator());
3511     // Copy original phi DL over to the new one.
3512     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3513     Value *&EndValue = IVEndValues[OrigPhi];
3514     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3515     if (OrigPhi == OldInduction) {
3516       // We know what the end value is.
3517       EndValue = VectorTripCount;
3518     } else {
3519       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3520 
3521       // Fast-math-flags propagate from the original induction instruction.
3522       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3523         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3524 
3525       Type *StepType = II.getStep()->getType();
3526       Instruction::CastOps CastOp =
3527           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3528       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3529       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3530       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3531       EndValue->setName("ind.end");
3532 
3533       // Compute the end value for the additional bypass (if applicable).
3534       if (AdditionalBypass.first) {
3535         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3536         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3537                                          StepType, true);
3538         CRD =
3539             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3540         EndValueFromAdditionalBypass =
3541             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3542         EndValueFromAdditionalBypass->setName("ind.end");
3543       }
3544     }
3545     // The new PHI merges the original incoming value, in case of a bypass,
3546     // or the value at the end of the vectorized loop.
3547     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3548 
3549     // Fix the scalar body counter (PHI node).
3550     // The old induction's phi node in the scalar body needs the truncated
3551     // value.
3552     for (BasicBlock *BB : LoopBypassBlocks)
3553       BCResumeVal->addIncoming(II.getStartValue(), BB);
3554 
3555     if (AdditionalBypass.first)
3556       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3557                                             EndValueFromAdditionalBypass);
3558 
3559     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3560   }
3561 }
3562 
3563 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3564                                                       MDNode *OrigLoopID) {
3565   assert(L && "Expected valid loop.");
3566 
3567   // The trip counts should be cached by now.
3568   Value *Count = getOrCreateTripCount(L);
3569   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3570 
3571   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3572 
3573   // Add a check in the middle block to see if we have completed
3574   // all of the iterations in the first vector loop.
3575   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3576   // If tail is to be folded, we know we don't need to run the remainder.
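  //
  // For illustration (shorthand IR; value names are illustrative only):
  //
  //   middle.block:
  //     %cmp.n = icmp eq i64 %N, %n.vec   ; i.e. N == (N - N % VF)
  //     br i1 %cmp.n, label %exit.block, label %scalar.ph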
3577   if (!Cost->foldTailByMasking()) {
3578     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3579                                         Count, VectorTripCount, "cmp.n",
3580                                         LoopMiddleBlock->getTerminator());
3581 
3582     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3583     // of the corresponding compare because they may have ended up with
3584     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3586     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3587     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3588   }
3589 
3590   // Get ready to start creating new instructions into the vectorized body.
3591   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3592          "Inconsistent vector loop preheader");
3593   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3594 
3595   Optional<MDNode *> VectorizedLoopID =
3596       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3597                                       LLVMLoopVectorizeFollowupVectorized});
3598   if (VectorizedLoopID.hasValue()) {
3599     L->setLoopID(VectorizedLoopID.getValue());
3600 
3601     // Do not setAlreadyVectorized if loop attributes have been defined
3602     // explicitly.
3603     return LoopVectorPreHeader;
3604   }
3605 
3606   // Keep all loop hints from the original loop on the vector loop (we'll
3607   // replace the vectorizer-specific hints below).
3608   if (MDNode *LID = OrigLoop->getLoopID())
3609     L->setLoopID(LID);
3610 
3611   LoopVectorizeHints Hints(L, true, *ORE);
3612   Hints.setAlreadyVectorized();
3613 
3614 #ifdef EXPENSIVE_CHECKS
3615   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3616   LI->verify(*DT);
3617 #endif
3618 
3619   return LoopVectorPreHeader;
3620 }
3621 
3622 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3623   /*
3624    In this function we generate a new loop. The new loop will contain
3625    the vectorized instructions while the old loop will continue to run the
3626    scalar remainder.
3627 
3628        [ ] <-- loop iteration number check.
3629     /   |
3630    /    v
3631   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3632   |  /  |
3633   | /   v
3634   ||   [ ]     <-- vector pre header.
3635   |/    |
3636   |     v
3637   |    [  ] \
3638   |    [  ]_|   <-- vector loop.
3639   |     |
3640   |     v
3641   |   -[ ]   <--- middle-block.
3642   |  /  |
3643   | /   v
3644   -|- >[ ]     <--- new preheader.
3645    |    |
3646    |    v
3647    |   [ ] \
3648    |   [ ]_|   <-- old scalar loop to handle remainder.
3649     \   |
3650      \  v
3651       >[ ]     <-- exit block.
3652    ...
3653    */
3654 
3655   // Get the metadata of the original loop before it gets modified.
3656   MDNode *OrigLoopID = OrigLoop->getLoopID();
3657 
3658   // Workaround!  Compute the trip count of the original loop and cache it
3659   // before we start modifying the CFG.  This code has a systemic problem
3660   // wherein it tries to run analysis over partially constructed IR; this is
3661   // wrong, and not simply for SCEV.  The trip count of the original loop
3662   // simply happens to be prone to hitting this in practice.  In theory, we
3663   // can hit the same issue for any SCEV, or ValueTracking query done during
3664   // mutation.  See PR49900.
3665   getOrCreateTripCount(OrigLoop);
3666 
3667   // Create an empty vector loop, and prepare basic blocks for the runtime
3668   // checks.
3669   Loop *Lp = createVectorLoopSkeleton("");
3670 
3671   // Now, compare the new count to zero. If it is zero skip the vector loop and
3672   // jump to the scalar loop. This check also covers the case where the
3673   // backedge-taken count is uint##_max: adding one to it will overflow leading
3674   // to an incorrect trip count of zero. In this (rare) case we will also jump
3675   // to the scalar loop.
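  //
  // For example (the narrow width is for illustration only): with an i8 trip
  // count and a backedge-taken count of 255, adding one wraps the trip count
  // to 0, so this check sends execution to the scalar loop.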
3676   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3677 
3678   // Generate the code to check any assumptions that we've made for SCEV
3679   // expressions.
3680   emitSCEVChecks(Lp, LoopScalarPreHeader);
3681 
3682   // Generate the code that checks in runtime if arrays overlap. We put the
3683   // checks into a separate block to make the more common case of few elements
3684   // faster.
3685   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3686 
3687   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3689   // induction variables. In the code below we also support a case where we
3690   // don't have a single induction variable.
3691   //
3692   // We try to obtain an induction variable from the original loop as hard
3693   // as possible. However if we don't find one that:
3694   //   - is an integer
3695   //   - counts from zero, stepping by one
3696   //   - is the size of the widest induction variable type
3697   // then we create a new one.
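  //
  // For illustration (shorthand IR; value names are illustrative only), the
  // canonical induction created for the vector loop is of the form:
  //
  //   vector.body:
  //     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //     %index.next = add i64 %index, <VF * UF>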
3698   OldInduction = Legal->getPrimaryInduction();
3699   Type *IdxTy = Legal->getWidestInductionType();
3700   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3701   // The loop step is equal to the vectorization factor (num of SIMD elements)
3702   // times the unroll factor (num of SIMD instructions).
3703   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3704   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3705   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3706   Induction =
3707       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3708                               getDebugLocFromInstOrOperands(OldInduction));
3709 
3710   // Emit phis for the new starting index of the scalar loop.
3711   createInductionResumeValues(Lp, CountRoundDown);
3712 
3713   return completeLoopSkeleton(Lp, OrigLoopID);
3714 }
3715 
3716 // Fix up external users of the induction variable. At this point, we are
3717 // in LCSSA form, with all external PHIs that use the IV having one input value,
3718 // coming from the remainder loop. We need those PHIs to also have a correct
3719 // value for the IV when arriving directly from the middle block.
3720 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3721                                        const InductionDescriptor &II,
3722                                        Value *CountRoundDown, Value *EndValue,
3723                                        BasicBlock *MiddleBlock) {
3724   // There are two kinds of external IV usages - those that use the value
3725   // computed in the last iteration (the PHI) and those that use the penultimate
3726   // value (the value that feeds into the phi from the loop latch).
3727   // We allow both, but they, obviously, have different values.
3728 
3729   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3730 
3731   DenseMap<Value *, Value *> MissingVals;
3732 
3733   // An external user of the last iteration's value should see the value that
3734   // the remainder loop uses to initialize its own IV.
3735   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3736   for (User *U : PostInc->users()) {
3737     Instruction *UI = cast<Instruction>(U);
3738     if (!OrigLoop->contains(UI)) {
3739       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3740       MissingVals[UI] = EndValue;
3741     }
3742   }
3743 
  // An external user of the penultimate value needs to see EndValue - Step.
3745   // The simplest way to get this is to recompute it from the constituent SCEVs,
3746   // that is Start + (Step * (CRD - 1)).
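  // For example (numbers are illustrative only): for an induction with
  // Start = 0, Step = 2 and a vector trip count CRD = 8, the escaping
  // penultimate value is 0 + 2 * (8 - 1) = 14, i.e. the value seen by the
  // last vector iteration, one Step below EndValue = 16.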
3747   for (User *U : OrigPhi->users()) {
3748     auto *UI = cast<Instruction>(U);
3749     if (!OrigLoop->contains(UI)) {
3750       const DataLayout &DL =
3751           OrigLoop->getHeader()->getModule()->getDataLayout();
3752       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3753 
3754       IRBuilder<> B(MiddleBlock->getTerminator());
3755 
3756       // Fast-math-flags propagate from the original induction instruction.
3757       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3758         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3759 
3760       Value *CountMinusOne = B.CreateSub(
3761           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3762       Value *CMO =
3763           !II.getStep()->getType()->isIntegerTy()
3764               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3765                              II.getStep()->getType())
3766               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3767       CMO->setName("cast.cmo");
3768       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3769       Escape->setName("ind.escape");
3770       MissingVals[UI] = Escape;
3771     }
3772   }
3773 
3774   for (auto &I : MissingVals) {
3775     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3777     // that is %IV2 = phi [...], [ %IV1, %latch ]
3778     // In this case, if IV1 has an external use, we need to avoid adding both
3779     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3780     // don't already have an incoming value for the middle block.
3781     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3782       PHI->addIncoming(I.second, MiddleBlock);
3783   }
3784 }
3785 
3786 namespace {
3787 
3788 struct CSEDenseMapInfo {
3789   static bool canHandle(const Instruction *I) {
3790     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3791            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3792   }
3793 
3794   static inline Instruction *getEmptyKey() {
3795     return DenseMapInfo<Instruction *>::getEmptyKey();
3796   }
3797 
3798   static inline Instruction *getTombstoneKey() {
3799     return DenseMapInfo<Instruction *>::getTombstoneKey();
3800   }
3801 
3802   static unsigned getHashValue(const Instruction *I) {
3803     assert(canHandle(I) && "Unknown instruction!");
3804     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3805                                                            I->value_op_end()));
3806   }
3807 
3808   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3809     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3810         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3811       return LHS == RHS;
3812     return LHS->isIdenticalTo(RHS);
3813   }
3814 };
3815 
3816 } // end anonymous namespace
3817 
/// Perform CSE of induction variable instructions.
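/// For illustration (shorthand IR; value names are illustrative only), two
/// identical extractelement instructions left in the vectorized body:
///   %a = extractelement <4 x i32> %v, i32 0
///   %b = extractelement <4 x i32> %v, i32 0
/// are folded so that all uses of %b are rewritten to %a and %b is erased.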
3819 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3821   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3822   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3823     Instruction *In = &*I++;
3824 
3825     if (!CSEDenseMapInfo::canHandle(In))
3826       continue;
3827 
3828     // Check if we can replace this instruction with any of the
3829     // visited instructions.
3830     if (Instruction *V = CSEMap.lookup(In)) {
3831       In->replaceAllUsesWith(V);
3832       In->eraseFromParent();
3833       continue;
3834     }
3835 
3836     CSEMap[In] = In;
3837   }
3838 }
3839 
3840 InstructionCost
3841 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3842                                               bool &NeedToScalarize) const {
3843   Function *F = CI->getCalledFunction();
3844   Type *ScalarRetTy = CI->getType();
3845   SmallVector<Type *, 4> Tys, ScalarTys;
3846   for (auto &ArgOp : CI->arg_operands())
3847     ScalarTys.push_back(ArgOp->getType());
3848 
3849   // Estimate cost of scalarized vector call. The source operands are assumed
3850   // to be vectors, so we need to extract individual elements from there,
3851   // execute VF scalar calls, and then gather the result into the vector return
3852   // value.
3853   InstructionCost ScalarCallCost =
3854       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3855   if (VF.isScalar())
3856     return ScalarCallCost;
3857 
3858   // Compute corresponding vector type for return value and arguments.
3859   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3860   for (Type *ScalarTy : ScalarTys)
3861     Tys.push_back(ToVectorTy(ScalarTy, VF));
3862 
3863   // Compute costs of unpacking argument values for the scalar calls and
3864   // packing the return values to a vector.
3865   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3866 
3867   InstructionCost Cost =
3868       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
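
  // For example (costs are illustrative only): with VF = 4, a scalar call
  // cost of 10 and a scalarization overhead of 8, the scalarized cost above
  // is 4 * 10 + 8 = 48, which is then compared against the vector call cost.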
3869 
3870   // If we can't emit a vector call for this function, then the currently found
3871   // cost is the cost we need to return.
3872   NeedToScalarize = true;
3873   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3874   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3875 
3876   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3877     return Cost;
3878 
3879   // If the corresponding vector cost is cheaper, return its cost.
3880   InstructionCost VectorCallCost =
3881       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3882   if (VectorCallCost < Cost) {
3883     NeedToScalarize = false;
3884     Cost = VectorCallCost;
3885   }
3886   return Cost;
3887 }
3888 
3889 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3890   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3891     return Elt;
3892   return VectorType::get(Elt, VF);
3893 }
3894 
3895 InstructionCost
3896 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3897                                                    ElementCount VF) const {
3898   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3899   assert(ID && "Expected intrinsic call!");
3900   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3901   FastMathFlags FMF;
3902   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3903     FMF = FPMO->getFastMathFlags();
3904 
3905   SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3906   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3907   SmallVector<Type *> ParamTys;
3908   std::transform(FTy->param_begin(), FTy->param_end(),
3909                  std::back_inserter(ParamTys),
3910                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3911 
3912   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3913                                     dyn_cast<IntrinsicInst>(CI));
3914   return TTI.getIntrinsicInstrCost(CostAttrs,
3915                                    TargetTransformInfo::TCK_RecipThroughput);
3916 }
3917 
3918 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3919   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3920   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3921   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3922 }
3923 
3924 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3925   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3926   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3927   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3928 }
3929 
3930 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3931   // For every instruction `I` in MinBWs, truncate the operands, create a
3932   // truncated version of `I` and reextend its result. InstCombine runs
3933   // later and will remove any ext/trunc pairs.
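  //
  // For illustration (shorthand IR; value names are illustrative only), an
  // i32 add whose minimal bit width is 8, at VF = 4:
  //
  //   %a = add <4 x i32> %x, %y
  //
  // becomes
  //
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a.ex = zext <4 x i8> %a.tr to <4 x i32>
  //
  // and all uses of the original %a are rewritten to use %a.ex.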
3934   SmallPtrSet<Value *, 4> Erased;
3935   for (const auto &KV : Cost->getMinimalBitwidths()) {
3936     // If the value wasn't vectorized, we must maintain the original scalar
3937     // type. The absence of the value from State indicates that it
3938     // wasn't vectorized.
3939     VPValue *Def = State.Plan->getVPValue(KV.first);
3940     if (!State.hasAnyVectorValue(Def))
3941       continue;
3942     for (unsigned Part = 0; Part < UF; ++Part) {
3943       Value *I = State.get(Def, Part);
3944       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3945         continue;
3946       Type *OriginalTy = I->getType();
3947       Type *ScalarTruncatedTy =
3948           IntegerType::get(OriginalTy->getContext(), KV.second);
3949       auto *TruncatedTy = FixedVectorType::get(
3950           ScalarTruncatedTy,
3951           cast<FixedVectorType>(OriginalTy)->getNumElements());
3952       if (TruncatedTy == OriginalTy)
3953         continue;
3954 
3955       IRBuilder<> B(cast<Instruction>(I));
3956       auto ShrinkOperand = [&](Value *V) -> Value * {
3957         if (auto *ZI = dyn_cast<ZExtInst>(V))
3958           if (ZI->getSrcTy() == TruncatedTy)
3959             return ZI->getOperand(0);
3960         return B.CreateZExtOrTrunc(V, TruncatedTy);
3961       };
3962 
3963       // The actual instruction modification depends on the instruction type,
3964       // unfortunately.
3965       Value *NewI = nullptr;
3966       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3967         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3968                              ShrinkOperand(BO->getOperand(1)));
3969 
3970         // Any wrapping introduced by shrinking this operation shouldn't be
3971         // considered undefined behavior. So, we can't unconditionally copy
3972         // arithmetic wrapping flags to NewI.
3973         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3974       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3975         NewI =
3976             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3977                          ShrinkOperand(CI->getOperand(1)));
3978       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3979         NewI = B.CreateSelect(SI->getCondition(),
3980                               ShrinkOperand(SI->getTrueValue()),
3981                               ShrinkOperand(SI->getFalseValue()));
3982       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3983         switch (CI->getOpcode()) {
3984         default:
3985           llvm_unreachable("Unhandled cast!");
3986         case Instruction::Trunc:
3987           NewI = ShrinkOperand(CI->getOperand(0));
3988           break;
3989         case Instruction::SExt:
3990           NewI = B.CreateSExtOrTrunc(
3991               CI->getOperand(0),
3992               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3993           break;
3994         case Instruction::ZExt:
3995           NewI = B.CreateZExtOrTrunc(
3996               CI->getOperand(0),
3997               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3998           break;
3999         }
4000       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4001         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
4002                              ->getNumElements();
4003         auto *O0 = B.CreateZExtOrTrunc(
4004             SI->getOperand(0),
4005             FixedVectorType::get(ScalarTruncatedTy, Elements0));
4006         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
4007                              ->getNumElements();
4008         auto *O1 = B.CreateZExtOrTrunc(
4009             SI->getOperand(1),
4010             FixedVectorType::get(ScalarTruncatedTy, Elements1));
4011 
4012         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4013       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4014         // Don't do anything with the operands, just extend the result.
4015         continue;
4016       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4017         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
4018                             ->getNumElements();
4019         auto *O0 = B.CreateZExtOrTrunc(
4020             IE->getOperand(0),
4021             FixedVectorType::get(ScalarTruncatedTy, Elements));
4022         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4023         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4024       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4025         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
4026                             ->getNumElements();
4027         auto *O0 = B.CreateZExtOrTrunc(
4028             EE->getOperand(0),
4029             FixedVectorType::get(ScalarTruncatedTy, Elements));
4030         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4031       } else {
4032         // If we don't know what to do, be conservative and don't do anything.
4033         continue;
4034       }
4035 
4036       // Lastly, extend the result.
4037       NewI->takeName(cast<Instruction>(I));
4038       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4039       I->replaceAllUsesWith(Res);
4040       cast<Instruction>(I)->eraseFromParent();
4041       Erased.insert(I);
4042       State.reset(Def, Res, Part);
4043     }
4044   }
4045 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
4047   for (const auto &KV : Cost->getMinimalBitwidths()) {
4048     // If the value wasn't vectorized, we must maintain the original scalar
4049     // type. The absence of the value from State indicates that it
4050     // wasn't vectorized.
4051     VPValue *Def = State.Plan->getVPValue(KV.first);
4052     if (!State.hasAnyVectorValue(Def))
4053       continue;
4054     for (unsigned Part = 0; Part < UF; ++Part) {
4055       Value *I = State.get(Def, Part);
4056       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4057       if (Inst && Inst->use_empty()) {
4058         Value *NewI = Inst->getOperand(0);
4059         Inst->eraseFromParent();
4060         State.reset(Def, NewI, Part);
4061       }
4062     }
4063   }
4064 }
4065 
4066 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4067   // Insert truncates and extends for any truncated instructions as hints to
4068   // InstCombine.
4069   if (VF.isVector())
4070     truncateToMinimalBitwidths(State);
4071 
4072   // Fix widened non-induction PHIs by setting up the PHI operands.
4073   if (OrigPHIsToFix.size()) {
4074     assert(EnableVPlanNativePath &&
4075            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4076     fixNonInductionPHIs(State);
4077   }
4078 
4079   // At this point every instruction in the original loop is widened to a
4080   // vector form. Now we need to fix the recurrences in the loop. These PHI
4081   // nodes are currently empty because we did not want to introduce cycles.
4082   // This is the second stage of vectorizing recurrences.
4083   fixCrossIterationPHIs(State);
4084 
4085   // Forget the original basic block.
4086   PSE.getSE()->forgetLoop(OrigLoop);
4087 
4088   // Fix-up external users of the induction variables.
4089   for (auto &Entry : Legal->getInductionVars())
4090     fixupIVUsers(Entry.first, Entry.second,
4091                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4092                  IVEndValues[Entry.first], LoopMiddleBlock);
4093 
4094   fixLCSSAPHIs(State);
4095   for (Instruction *PI : PredicatedInstructions)
4096     sinkScalarOperands(&*PI);
4097 
4098   // Remove redundant induction instructions.
4099   cse(LoopVectorBody);
4100 
4101   // Set/update profile weights for the vector and remainder loops as original
4102   // loop iterations are now distributed among them. Note that original loop
4103   // represented by LoopScalarBody becomes remainder loop after vectorization.
4104   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // profile info is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
4110   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
4114   setProfileInfoAfterUnrolling(
4115       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4116       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4117 }
4118 
4119 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4120   // In order to support recurrences we need to be able to vectorize Phi nodes.
4121   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4122   // stage #2: We now need to fix the recurrences by adding incoming edges to
4123   // the currently empty PHI nodes. At this point every instruction in the
4124   // original loop is widened to a vector form so we can use them to construct
4125   // the incoming edges.
4126   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4127   for (VPRecipeBase &R : Header->phis()) {
4128     auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4129     if (!PhiR)
4130       continue;
4131     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4132     if (PhiR->getRecurrenceDescriptor()) {
4133       fixReduction(PhiR, State);
4134     } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4135       fixFirstOrderRecurrence(PhiR, State);
4136   }
4137 }
4138 
4139 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4140                                                   VPTransformState &State) {
4141   // This is the second phase of vectorizing first-order recurrences. An
4142   // overview of the transformation is described below. Suppose we have the
4143   // following loop.
4144   //
4145   //   for (int i = 0; i < n; ++i)
4146   //     b[i] = a[i] - a[i - 1];
4147   //
4148   // There is a first-order recurrence on "a". For this loop, the shorthand
4149   // scalar IR looks like:
4150   //
4151   //   scalar.ph:
4152   //     s_init = a[-1]
4153   //     br scalar.body
4154   //
4155   //   scalar.body:
4156   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4157   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4158   //     s2 = a[i]
4159   //     b[i] = s2 - s1
4160   //     br cond, scalar.body, ...
4161   //
  // In this example, s1 is a recurrence because its value depends on the
4163   // previous iteration. In the first phase of vectorization, we created a
4164   // temporary value for s1. We now complete the vectorization and produce the
4165   // shorthand vector IR shown below (for VF = 4, UF = 1).
4166   //
4167   //   vector.ph:
4168   //     v_init = vector(..., ..., ..., a[-1])
4169   //     br vector.body
4170   //
4171   //   vector.body
4172   //     i = phi [0, vector.ph], [i+4, vector.body]
4173   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4174   //     v2 = a[i, i+1, i+2, i+3];
4175   //     v3 = vector(v1(3), v2(0, 1, 2))
4176   //     b[i, i+1, i+2, i+3] = v2 - v3
4177   //     br cond, vector.body, middle.block
4178   //
4179   //   middle.block:
4180   //     x = v2(3)
4181   //     br scalar.ph
4182   //
4183   //   scalar.ph:
4184   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4185   //     br scalar.body
4186   //
4187   // After execution completes the vector loop, we extract the next value of
4188   // the recurrence (x) to use as the initial value in the scalar loop.
4189 
4190   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4191 
4192   auto *IdxTy = Builder.getInt32Ty();
4193   auto *One = ConstantInt::get(IdxTy, 1);
4194 
4195   // Create a vector from the initial value.
4196   auto *VectorInit = ScalarInit;
4197   if (VF.isVector()) {
4198     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4199     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4200     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4201     VectorInit = Builder.CreateInsertElement(
4202         PoisonValue::get(VectorType::get(VectorInit->getType(), VF)),
4203         VectorInit, LastIdx, "vector.recur.init");
4204   }
4205 
4206   VPValue *PreviousDef = PhiR->getBackedgeValue();
4207   // We constructed a temporary phi node in the first phase of vectorization.
4208   // This phi node will eventually be deleted.
4209   Builder.SetInsertPoint(cast<Instruction>(State.get(PhiR, 0)));
4210 
4211   // Create a phi node for the new recurrence. The current value will either be
4212   // the initial value inserted into a vector or loop-varying vector value.
4213   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4214   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4215 
4216   // Get the vectorized previous value of the last part UF - 1. It appears last
4217   // among all unrolled iterations, due to the order of their construction.
4218   Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4219 
4220   // Find and set the insertion point after the previous value if it is an
4221   // instruction.
4222   BasicBlock::iterator InsertPt;
4223   // Note that the previous value may have been constant-folded so it is not
4224   // guaranteed to be an instruction in the vector loop.
4225   // FIXME: Loop invariant values do not form recurrences. We should deal with
4226   //        them earlier.
4227   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4228     InsertPt = LoopVectorBody->getFirstInsertionPt();
4229   else {
4230     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4231     if (isa<PHINode>(PreviousLastPart))
4232       // If the previous value is a phi node, we should insert after all the phi
4233       // nodes in the block containing the PHI to avoid breaking basic block
4234       // verification. Note that the basic block may be different to
4235       // LoopVectorBody, in case we predicate the loop.
4236       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4237     else
4238       InsertPt = ++PreviousInst->getIterator();
4239   }
4240   Builder.SetInsertPoint(&*InsertPt);
4241 
4242   // The vector from which to take the initial value for the current iteration
4243   // (actual or unrolled). Initially, this is the vector phi node.
4244   Value *Incoming = VecPhi;
4245 
4246   // Shuffle the current and previous vector and update the vector parts.
4247   for (unsigned Part = 0; Part < UF; ++Part) {
4248     Value *PreviousPart = State.get(PreviousDef, Part);
4249     Value *PhiPart = State.get(PhiR, Part);
4250     auto *Shuffle = VF.isVector()
4251                         ? Builder.CreateVectorSplice(Incoming, PreviousPart, -1)
4252                         : Incoming;
4253     PhiPart->replaceAllUsesWith(Shuffle);
4254     cast<Instruction>(PhiPart)->eraseFromParent();
4255     State.reset(PhiR, Shuffle, Part);
4256     Incoming = PreviousPart;
4257   }
4258 
4259   // Fix the latch value of the new recurrence in the vector loop.
4260   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4261 
4262   // Extract the last vector element in the middle block. This will be the
4263   // initial value for the recurrence when jumping to the scalar loop.
4264   auto *ExtractForScalar = Incoming;
4265   if (VF.isVector()) {
4266     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4267     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4268     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4269     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4270                                                     "vector.recur.extract");
4271   }
4272   // Extract the second last element in the middle block if the
4273   // Phi is used outside the loop. We need to extract the phi itself
4274   // and not the last element (the phi update in the current iteration). This
4275   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4276   // when the scalar loop is not run at all.
4277   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4278   if (VF.isVector()) {
4279     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4280     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4281     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4282         Incoming, Idx, "vector.recur.extract.for.phi");
4283   } else if (UF > 1)
4284     // When loop is unrolled without vectorizing, initialize
4285     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
4286     // of `Incoming`. This is analogous to the vectorized case above: extracting
4287     // the second last element when VF > 1.
4288     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4289 
4290   // Fix the initial value of the original recurrence in the scalar loop.
4291   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4292   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4293   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4294   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4295     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4296     Start->addIncoming(Incoming, BB);
4297   }
4298 
4299   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4300   Phi->setName("scalar.recur");
4301 
4302   // Finally, fix users of the recurrence outside the loop. The users will need
4303   // either the last value of the scalar recurrence or the last value of the
4304   // vector recurrence we extracted in the middle block. Since the loop is in
4305   // LCSSA form, we just need to find all the phi nodes for the original scalar
4306   // recurrence in the exit block, and then add an edge for the middle block.
4307   // Note that LCSSA does not imply single entry when the original scalar loop
4308   // had multiple exiting edges (as we always run the last iteration in the
4309   // scalar epilogue); in that case, the exiting path through middle will be
4310   // dynamically dead and the value picked for the phi doesn't matter.
4311   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4312     if (any_of(LCSSAPhi.incoming_values(),
4313                [Phi](Value *V) { return V == Phi; }))
4314       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4315 }
4316 
4317 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
4318                                        VPTransformState &State) {
4319   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
4321   assert(Legal->isReductionVariable(OrigPhi) &&
4322          "Unable to find the reduction variable");
4323   const RecurrenceDescriptor &RdxDesc = *PhiR->getRecurrenceDescriptor();
4324 
4325   RecurKind RK = RdxDesc.getRecurrenceKind();
4326   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4327   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4328   setDebugLocFromInst(ReductionStartValue);
4329   bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4330 
4331   VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4332   // This is the vector-clone of the value that leaves the loop.
4333   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4334 
4335   // Wrap flags are in general invalid after vectorization, clear them.
4336   clearReductionWrapFlags(RdxDesc, State);
4337 
4338   // Fix the vector-loop phi.
4339 
4340   // Reductions do not have to start at zero. They can start with
4341   // any loop invariant values.
4342   BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4343 
4344   bool IsOrdered = IsInLoopReductionPhi && Cost->useOrderedReductions(RdxDesc);
4345 
4346   for (unsigned Part = 0; Part < UF; ++Part) {
4347     if (IsOrdered && Part > 0)
4348       break;
4349     Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4350     Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4351     if (IsOrdered)
4352       Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4353 
4354     cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4355   }
4356 
4357   // Before each round, move the insertion point right between
4358   // the PHIs and the values we are going to write.
4359   // This allows us to write both PHINodes and the extractelement
4360   // instructions.
4361   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4362 
4363   setDebugLocFromInst(LoopExitInst);
4364 
4365   Type *PhiTy = OrigPhi->getType();
4366   // If tail is folded by masking, the vector value to leave the loop should be
4367   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4368   // instead of the former. For an inloop reduction the reduction will already
4369   // be predicated, and does not need to be handled here.
4370   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4371     for (unsigned Part = 0; Part < UF; ++Part) {
4372       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4373       Value *Sel = nullptr;
4374       for (User *U : VecLoopExitInst->users()) {
4375         if (isa<SelectInst>(U)) {
4376           assert(!Sel && "Reduction exit feeding two selects");
4377           Sel = U;
4378         } else
4379           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4380       }
4381       assert(Sel && "Reduction exit feeds no select");
4382       State.reset(LoopExitInstDef, Sel, Part);
4383 
4384       // If the target can create a predicated operator for the reduction at no
4385       // extra cost in the loop (for example a predicated vadd), it can be
4386       // cheaper for the select to remain in the loop than be sunk out of it,
4387       // and so use the select value for the phi instead of the old
4388       // LoopExitValue.
4389       if (PreferPredicatedReductionSelect ||
4390           TTI->preferPredicatedReductionSelect(
4391               RdxDesc.getOpcode(), PhiTy,
4392               TargetTransformInfo::ReductionFlags())) {
4393         auto *VecRdxPhi =
4394             cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4395         VecRdxPhi->setIncomingValueForBlock(
4396             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4397       }
4398     }
4399   }
4400 
4401   // If the vector reduction can be performed in a smaller type, we truncate
4402   // then extend the loop exit value to enable InstCombine to evaluate the
4403   // entire expression in the smaller type.
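  //
  // For illustration (shorthand IR; value names are illustrative only), an
  // i32 add reduction whose recurrence type is i8 gets, per unrolled part:
  //
  //   %rdx.trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %rdx.ext   = zext <4 x i8> %rdx.trunc to <4 x i32>
  //
  // with users of %rdx (other than the trunc itself) redirected to %rdx.ext.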
4404   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4405     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4406     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4407     Builder.SetInsertPoint(
4408         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4409     VectorParts RdxParts(UF);
4410     for (unsigned Part = 0; Part < UF; ++Part) {
4411       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4412       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4413       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4414                                         : Builder.CreateZExt(Trunc, VecTy);
4415       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4416            UI != RdxParts[Part]->user_end();)
4417         if (*UI != Trunc) {
4418           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4419           RdxParts[Part] = Extnd;
4420         } else {
4421           ++UI;
4422         }
4423     }
4424     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4425     for (unsigned Part = 0; Part < UF; ++Part) {
4426       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4427       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4428     }
4429   }
4430 
4431   // Reduce all of the unrolled parts into a single vector.
4432   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4433   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4434 
4435   // The middle block terminator has already been assigned a DebugLoc here (the
4436   // OrigLoop's single latch terminator). We want the whole middle block to
4437   // appear to execute on this line because: (a) it is all compiler generated,
4438   // (b) these instructions are always executed after evaluating the latch
4439   // conditional branch, and (c) other passes may add new predecessors which
4440   // terminate on this line. This is the easiest way to ensure we don't
4441   // accidentally cause an extra step back into the loop while debugging.
4442   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4443   if (IsOrdered)
4444     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4445   else {
4446     // Floating-point operations should have some FMF to enable the reduction.
4447     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4448     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4449     for (unsigned Part = 1; Part < UF; ++Part) {
4450       Value *RdxPart = State.get(LoopExitInstDef, Part);
4451       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4452         ReducedPartRdx = Builder.CreateBinOp(
4453             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4454       } else {
4455         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4456       }
4457     }
4458   }
4459 
4460   // Create the reduction after the loop. Note that inloop reductions create the
4461   // target reduction in the loop using a Reduction recipe.
4462   if (VF.isVector() && !IsInLoopReductionPhi) {
4463     ReducedPartRdx =
4464         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4465     // If the reduction can be performed in a smaller type, we need to extend
4466     // the reduction to the wider type before we branch to the original loop.
4467     if (PhiTy != RdxDesc.getRecurrenceType())
4468       ReducedPartRdx = RdxDesc.isSigned()
4469                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4470                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4471   }
4472 
4473   // Create a phi node that merges control-flow from the backedge-taken check
4474   // block and the middle block.
4475   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4476                                         LoopScalarPreHeader->getTerminator());
4477   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4478     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4479   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4480 
4481   // Now, we need to fix the users of the reduction variable
4482   // inside and outside of the scalar remainder loop.
4483 
4484   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4485   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4487   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4488     if (any_of(LCSSAPhi.incoming_values(),
4489                [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4490       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4491 
4492   // Fix the scalar loop reduction variable with the incoming reduction sum
4493   // from the vector body and from the backedge value.
4494   int IncomingEdgeBlockIdx =
4495       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4496   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4497   // Pick the other block.
4498   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4499   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4500   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4501 }
4502 
4503 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4504                                                   VPTransformState &State) {
4505   RecurKind RK = RdxDesc.getRecurrenceKind();
4506   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4507     return;
4508 
4509   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4510   assert(LoopExitInstr && "null loop exit instruction");
4511   SmallVector<Instruction *, 8> Worklist;
4512   SmallPtrSet<Instruction *, 8> Visited;
4513   Worklist.push_back(LoopExitInstr);
4514   Visited.insert(LoopExitInstr);
4515 
4516   while (!Worklist.empty()) {
4517     Instruction *Cur = Worklist.pop_back_val();
4518     if (isa<OverflowingBinaryOperator>(Cur))
4519       for (unsigned Part = 0; Part < UF; ++Part) {
4520         Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4521         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4522       }
4523 
4524     for (User *U : Cur->users()) {
4525       Instruction *UI = cast<Instruction>(U);
4526       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4527           Visited.insert(UI).second)
4528         Worklist.push_back(UI);
4529     }
4530   }
4531 }
4532 
4533 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4534   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4535     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4536       // Some phis were already hand updated by the reduction and recurrence
4537       // code above, leave them alone.
4538       continue;
4539 
4540     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4541     // Non-instruction incoming values will have only one value.
4542 
4543     VPLane Lane = VPLane::getFirstLane();
4544     if (isa<Instruction>(IncomingValue) &&
4545         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4546                                            VF))
4547       Lane = VPLane::getLastLaneForVF(VF);
4548 
4549     // Can be a loop invariant incoming value or the last scalar value to be
4550     // extracted from the vectorized loop.
4551     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4552     Value *lastIncomingValue =
4553         OrigLoop->isLoopInvariant(IncomingValue)
4554             ? IncomingValue
4555             : State.get(State.Plan->getVPValue(IncomingValue),
4556                         VPIteration(UF - 1, Lane));
4557     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4558   }
4559 }
4560 
4561 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4562   // The basic block and loop containing the predicated instruction.
4563   auto *PredBB = PredInst->getParent();
4564   auto *VectorLoop = LI->getLoopFor(PredBB);
4565 
4566   // Initialize a worklist with the operands of the predicated instruction.
4567   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4568 
4569   // Holds instructions that we need to analyze again. An instruction may be
4570   // reanalyzed if we don't yet know if we can sink it or not.
4571   SmallVector<Instruction *, 8> InstsToReanalyze;
4572 
4573   // Returns true if a given use occurs in the predicated block. Phi nodes use
4574   // their operands in their corresponding predecessor blocks.
4575   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4576     auto *I = cast<Instruction>(U.getUser());
4577     BasicBlock *BB = I->getParent();
4578     if (auto *Phi = dyn_cast<PHINode>(I))
4579       BB = Phi->getIncomingBlock(
4580           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4581     return BB == PredBB;
4582   };
4583 
4584   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4586   // operands are then added to the worklist. The algorithm ends after one pass
4587   // through the worklist doesn't sink a single instruction.
4588   bool Changed;
4589   do {
4590     // Add the instructions that need to be reanalyzed to the worklist, and
4591     // reset the changed indicator.
4592     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4593     InstsToReanalyze.clear();
4594     Changed = false;
4595 
4596     while (!Worklist.empty()) {
4597       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4598 
4599       // We can't sink an instruction if it is a phi node, is not in the loop,
4600       // or may have side effects.
4601       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4602           I->mayHaveSideEffects())
4603         continue;
4604 
4605       // If the instruction is already in PredBB, check if we can sink its
4606       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4607       // sinking the scalar instruction I, hence it appears in PredBB; but it
4608       // may have failed to sink I's operands (recursively), which we try
4609       // (again) here.
4610       if (I->getParent() == PredBB) {
4611         Worklist.insert(I->op_begin(), I->op_end());
4612         continue;
4613       }
4614 
4615       // It's legal to sink the instruction if all its uses occur in the
4616       // predicated block. Otherwise, there's nothing to do yet, and we may
4617       // need to reanalyze the instruction.
4618       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4619         InstsToReanalyze.push_back(I);
4620         continue;
4621       }
4622 
4623       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4625       I->moveBefore(&*PredBB->getFirstInsertionPt());
4626       Worklist.insert(I->op_begin(), I->op_end());
4627 
4628       // The sinking may have enabled other instructions to be sunk, so we will
4629       // need to iterate.
4630       Changed = true;
4631     }
4632   } while (Changed);
4633 }
4634 
4635 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4636   for (PHINode *OrigPhi : OrigPHIsToFix) {
4637     VPWidenPHIRecipe *VPPhi =
4638         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4639     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4640     // Make sure the builder has a valid insert point.
4641     Builder.SetInsertPoint(NewPhi);
4642     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4643       VPValue *Inc = VPPhi->getIncomingValue(i);
4644       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4645       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4646     }
4647   }
4648 }
4649 
4650 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4651   return Cost->useOrderedReductions(RdxDesc);
4652 }
4653 
4654 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4655                                    VPUser &Operands, unsigned UF,
4656                                    ElementCount VF, bool IsPtrLoopInvariant,
4657                                    SmallBitVector &IsIndexLoopInvariant,
4658                                    VPTransformState &State) {
4659   // Construct a vector GEP by widening the operands of the scalar GEP as
4660   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4661   // results in a vector of pointers when at least one operand of the GEP
4662   // is vector-typed. Thus, to keep the representation compact, we only use
4663   // vector-typed operands for loop-varying values.
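  //
  // For illustration (shorthand IR; value names are illustrative only), a GEP
  // with a loop-invariant base pointer and a loop-varying index, at VF = 4:
  //
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  //
  // is widened to
  //
  //   %gep = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.iv
  //
  // yielding a <4 x i32*> vector of pointers.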
4664 
4665   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4666     // If we are vectorizing, but the GEP has only loop-invariant operands,
4667     // the GEP we build (by only using vector-typed operands for
4668     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4669     // produce a vector of pointers, we need to either arbitrarily pick an
4670     // operand to broadcast, or broadcast a clone of the original GEP.
4671     // Here, we broadcast a clone of the original.
4672     //
4673     // TODO: If at some point we decide to scalarize instructions having
4674     //       loop-invariant operands, this special case will no longer be
4675     //       required. We would add the scalarization decision to
4676     //       collectLoopScalars() and teach getVectorValue() to broadcast
4677     //       the lane-zero scalar value.
4678     auto *Clone = Builder.Insert(GEP->clone());
4679     for (unsigned Part = 0; Part < UF; ++Part) {
4680       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4681       State.set(VPDef, EntryPart, Part);
4682       addMetadata(EntryPart, GEP);
4683     }
4684   } else {
4685     // If the GEP has at least one loop-varying operand, we are sure to
4686     // produce a vector of pointers. But if we are only unrolling, we want
4687     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4688     // produce with the code below will be scalar (if VF == 1) or vector
4689     // (otherwise). Note that for the unroll-only case, we still maintain
4690     // values in the vector mapping with initVector, as we do for other
4691     // instructions.
4692     for (unsigned Part = 0; Part < UF; ++Part) {
4693       // The pointer operand of the new GEP. If it's loop-invariant, we
4694       // won't broadcast it.
4695       auto *Ptr = IsPtrLoopInvariant
4696                       ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4697                       : State.get(Operands.getOperand(0), Part);
4698 
4699       // Collect all the indices for the new GEP. If any index is
4700       // loop-invariant, we won't broadcast it.
4701       SmallVector<Value *, 4> Indices;
4702       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4703         VPValue *Operand = Operands.getOperand(I);
4704         if (IsIndexLoopInvariant[I - 1])
4705           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4706         else
4707           Indices.push_back(State.get(Operand, Part));
4708       }
4709 
4710       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4711       // but it should be a vector, otherwise.
4712       auto *NewGEP =
4713           GEP->isInBounds()
4714               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4715                                           Indices)
4716               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4717       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4718              "NewGEP is not a pointer vector");
4719       State.set(VPDef, NewGEP, Part);
4720       addMetadata(NewGEP, GEP);
4721     }
4722   }
4723 }
4724 
4725 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4726                                               RecurrenceDescriptor *RdxDesc,
4727                                               VPWidenPHIRecipe *PhiR,
4728                                               VPTransformState &State) {
4729   PHINode *P = cast<PHINode>(PN);
4730   if (EnableVPlanNativePath) {
4731     // Currently we enter here in the VPlan-native path for non-induction
4732     // PHIs where all control flow is uniform. We simply widen these PHIs.
4733     // Create a vector phi with no operands - the vector phi operands will be
4734     // set at the end of vector code generation.
4735     Type *VecTy = (State.VF.isScalar())
4736                       ? PN->getType()
4737                       : VectorType::get(PN->getType(), State.VF);
4738     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4739     State.set(PhiR, VecPhi, 0);
4740     OrigPHIsToFix.push_back(P);
4741 
4742     return;
4743   }
4744 
4745   assert(PN->getParent() == OrigLoop->getHeader() &&
4746          "Non-header phis should have been handled elsewhere");
4747 
4748   // In order to support recurrences we need to be able to vectorize Phi nodes.
4749   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4750   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4751   // this value when we vectorize all of the instructions that use the PHI.
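  //
  // For illustration only (hypothetical names, fixed VF = 4): an integer add
  // reduction phi
  //   %sum = phi i32 [ %start, %preheader ], [ %sum.next, %latch ]
  // is widened here to an empty vector phi whose incoming values are filled
  // in later; its preheader incoming value becomes <%start, 0, 0, 0> (the
  // start value inserted into a splat of the add identity 0), while any
  // additional unroll parts start from the plain identity splat.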
4752   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4753     bool ScalarPHI =
4754         (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4755     Type *VecTy =
4756         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4757 
4758     bool IsOrdered = Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4759                      Cost->useOrderedReductions(*RdxDesc);
4760     unsigned LastPartForNewPhi = IsOrdered ? 1 : State.UF;
4761     for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
4762       Value *EntryPart = PHINode::Create(
4763           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4764       State.set(PhiR, EntryPart, Part);
4765     }
4766     if (Legal->isFirstOrderRecurrence(P))
4767       return;
4768     VPValue *StartVPV = PhiR->getStartValue();
4769     Value *StartV = StartVPV->getLiveInIRValue();
4770 
4771     Value *Iden = nullptr;
4772 
4773     assert(Legal->isReductionVariable(P) && StartV &&
4774            "RdxDesc should only be set for reduction variables; in that case "
4775            "a StartV is also required");
4776     RecurKind RK = RdxDesc->getRecurrenceKind();
4777     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
      // MinMax reductions have the start value as their identity.
4779       if (ScalarPHI) {
4780         Iden = StartV;
4781       } else {
4782         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4783         Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4784         StartV = Iden =
4785             Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4786       }
4787     } else {
4788       Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4789           RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4790       Iden = IdenC;
4791 
4792       if (!ScalarPHI) {
4793         Iden = ConstantVector::getSplat(State.VF, IdenC);
4794         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4795         Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4796         Constant *Zero = Builder.getInt32(0);
4797         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4798       }
4799     }
4800 
4801     for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
4802       Value *EntryPart = State.get(PhiR, Part);
4803       // Make sure to add the reduction start value only to the
4804       // first unroll part.
4805       Value *StartVal = (Part == 0) ? StartV : Iden;
4806       cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4807     }
4808 
4809     return;
4810   }
4811 
4812   assert(!Legal->isReductionVariable(P) &&
4813          "reductions should be handled above");
4814 
4815   setDebugLocFromInst(P);
4816 
4817   // This PHINode must be an induction variable.
4818   // Make sure that we know about it.
4819   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4820 
4821   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4822   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4823 
4824   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4825   // which can be found from the original scalar operations.
4826   switch (II.getKind()) {
4827   case InductionDescriptor::IK_NoInduction:
4828     llvm_unreachable("Unknown induction");
4829   case InductionDescriptor::IK_IntInduction:
4830   case InductionDescriptor::IK_FpInduction:
4831     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4832   case InductionDescriptor::IK_PtrInduction: {
4833     // Handle the pointer induction variable case.
4834     assert(P->getType()->isPointerTy() && "Unexpected type.");
4835 
4836     if (Cost->isScalarAfterVectorization(P, State.VF)) {
      // This is the normalized induction value that starts counting at zero.
4838       Value *PtrInd =
4839           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4840       // Determine the number of scalars we need to generate for each unroll
4841       // iteration. If the instruction is uniform, we only need to generate the
4842       // first lane. Otherwise, we generate all VF values.
4843       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4844       unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4845 
4846       bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4847       Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4848       if (NeedsVectorIndex) {
4849         Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4850         UnitStepVec = Builder.CreateStepVector(VecIVTy);
4851         PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4852       }
4853 
4854       for (unsigned Part = 0; Part < UF; ++Part) {
4855         Value *PartStart = createStepForVF(
4856             Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4857 
4858         if (NeedsVectorIndex) {
4859           Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4860           Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4861           Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4862           Value *SclrGep =
4863               emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4864           SclrGep->setName("next.gep");
4865           State.set(PhiR, SclrGep, Part);
4866           // We've cached the whole vector, which means we can support the
4867           // extraction of any lane.
4868           continue;
4869         }
4870 
4871         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4872           Value *Idx = Builder.CreateAdd(
4873               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4874           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4875           Value *SclrGep =
4876               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4877           SclrGep->setName("next.gep");
4878           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4879         }
4880       }
4881       return;
4882     }
4883     assert(isa<SCEVConstant>(II.getStep()) &&
4884            "Induction step not a SCEV constant!");
4885     Type *PhiType = II.getStep()->getType();
4886 
4887     // Build a pointer phi
4888     Value *ScalarStartValue = II.getStartValue();
4889     Type *ScStValueType = ScalarStartValue->getType();
4890     PHINode *NewPointerPhi =
4891         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4892     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4893 
    // The pointer induction is advanced by a GEP placed in the loop latch.
4895     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4896     Instruction *InductionLoc = LoopLatch->getTerminator();
4897     const SCEV *ScalarStep = II.getStep();
4898     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4899     Value *ScalarStepValue =
4900         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4901     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4902     Value *NumUnrolledElems =
4903         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4904     Value *InductionGEP = GetElementPtrInst::Create(
4905         ScStValueType->getPointerElementType(), NewPointerPhi,
4906         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4907         InductionLoc);
4908     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4909 
    // Create UF actual address GEPs that use the pointer phi as their base
    // and a vectorized version of the step value (<step*0, ..., step*N>) as
    // their offset.
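    //
    // For illustration only (hypothetical names, fixed VF = 4): with scalar
    // step %step, part 0 computes element offsets <0, 1, 2, 3> * %step and
    // part 1 computes <4, 5, 6, 7> * %step; each set of offsets is applied to
    // %pointer.phi with a single GEP that takes a vector of indices and
    // therefore produces a vector of addresses.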
4913     for (unsigned Part = 0; Part < State.UF; ++Part) {
4914       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4915       Value *StartOffsetScalar =
4916           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4917       Value *StartOffset =
4918           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Add a step vector of consecutive numbers from zero to VF-1.
4920       StartOffset =
4921           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4922 
4923       Value *GEP = Builder.CreateGEP(
4924           ScStValueType->getPointerElementType(), NewPointerPhi,
4925           Builder.CreateMul(
4926               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4927               "vector.gep"));
4928       State.set(PhiR, GEP, Part);
4929     }
4930   }
4931   }
4932 }
4933 
4934 /// A helper function for checking whether an integer division-related
4935 /// instruction may divide by zero (in which case it must be predicated if
4936 /// executed conditionally in the scalar code).
4937 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4939 /// converted into multiplication, so we will still end up scalarizing
4940 /// the division, but can do so w/o predication.
4941 static bool mayDivideByZero(Instruction &I) {
4942   assert((I.getOpcode() == Instruction::UDiv ||
4943           I.getOpcode() == Instruction::SDiv ||
4944           I.getOpcode() == Instruction::URem ||
4945           I.getOpcode() == Instruction::SRem) &&
4946          "Unexpected instruction");
4947   Value *Divisor = I.getOperand(1);
4948   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4949   return !CInt || CInt->isZero();
4950 }
4951 
4952 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4953                                            VPUser &User,
4954                                            VPTransformState &State) {
4955   switch (I.getOpcode()) {
4956   case Instruction::Call:
4957   case Instruction::Br:
4958   case Instruction::PHI:
4959   case Instruction::GetElementPtr:
4960   case Instruction::Select:
4961     llvm_unreachable("This instruction is handled by a different recipe.");
4962   case Instruction::UDiv:
4963   case Instruction::SDiv:
4964   case Instruction::SRem:
4965   case Instruction::URem:
4966   case Instruction::Add:
4967   case Instruction::FAdd:
4968   case Instruction::Sub:
4969   case Instruction::FSub:
4970   case Instruction::FNeg:
4971   case Instruction::Mul:
4972   case Instruction::FMul:
4973   case Instruction::FDiv:
4974   case Instruction::FRem:
4975   case Instruction::Shl:
4976   case Instruction::LShr:
4977   case Instruction::AShr:
4978   case Instruction::And:
4979   case Instruction::Or:
4980   case Instruction::Xor: {
4981     // Just widen unops and binops.
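    //
    // For illustration only (hypothetical names, fixed VF = 4): a scalar
    //   %r = add nsw i32 %a, %b
    // becomes, for each unroll part,
    //   %r.vec = add nsw <4 x i32> %a.vec, %b.vec
    // with the IR flags (e.g. nsw) copied from the original instruction.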
4982     setDebugLocFromInst(&I);
4983 
4984     for (unsigned Part = 0; Part < UF; ++Part) {
4985       SmallVector<Value *, 2> Ops;
4986       for (VPValue *VPOp : User.operands())
4987         Ops.push_back(State.get(VPOp, Part));
4988 
4989       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4990 
4991       if (auto *VecOp = dyn_cast<Instruction>(V))
4992         VecOp->copyIRFlags(&I);
4993 
4994       // Use this vector value for all users of the original instruction.
4995       State.set(Def, V, Part);
4996       addMetadata(V, &I);
4997     }
4998 
4999     break;
5000   }
5001   case Instruction::ICmp:
5002   case Instruction::FCmp: {
5003     // Widen compares. Generate vector compares.
5004     bool FCmp = (I.getOpcode() == Instruction::FCmp);
5005     auto *Cmp = cast<CmpInst>(&I);
5006     setDebugLocFromInst(Cmp);
5007     for (unsigned Part = 0; Part < UF; ++Part) {
5008       Value *A = State.get(User.getOperand(0), Part);
5009       Value *B = State.get(User.getOperand(1), Part);
5010       Value *C = nullptr;
5011       if (FCmp) {
5012         // Propagate fast math flags.
5013         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
5014         Builder.setFastMathFlags(Cmp->getFastMathFlags());
5015         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
5016       } else {
5017         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
5018       }
5019       State.set(Def, C, Part);
5020       addMetadata(C, &I);
5021     }
5022 
5023     break;
5024   }
5025 
5026   case Instruction::ZExt:
5027   case Instruction::SExt:
5028   case Instruction::FPToUI:
5029   case Instruction::FPToSI:
5030   case Instruction::FPExt:
5031   case Instruction::PtrToInt:
5032   case Instruction::IntToPtr:
5033   case Instruction::SIToFP:
5034   case Instruction::UIToFP:
5035   case Instruction::Trunc:
5036   case Instruction::FPTrunc:
5037   case Instruction::BitCast: {
5038     auto *CI = cast<CastInst>(&I);
5039     setDebugLocFromInst(CI);
5040 
    // Vectorize casts.
5042     Type *DestTy =
5043         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
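    // For illustration only (hypothetical names, fixed VF = 4): a scalar
    //   %c = zext i8 %x to i32
    // becomes, for each unroll part,
    //   %c.vec = zext <4 x i8> %x.vec to <4 x i32>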
5044 
5045     for (unsigned Part = 0; Part < UF; ++Part) {
5046       Value *A = State.get(User.getOperand(0), Part);
5047       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
5048       State.set(Def, Cast, Part);
5049       addMetadata(Cast, &I);
5050     }
5051     break;
5052   }
5053   default:
5054     // This instruction is not vectorized by simple widening.
5055     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
5056     llvm_unreachable("Unhandled instruction!");
5057   } // end of switch.
5058 }
5059 
5060 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
5061                                                VPUser &ArgOperands,
5062                                                VPTransformState &State) {
5063   assert(!isa<DbgInfoIntrinsic>(I) &&
5064          "DbgInfoIntrinsic should have been dropped during VPlan construction");
5065   setDebugLocFromInst(&I);
5066 
5067   Module *M = I.getParent()->getParent()->getParent();
5068   auto *CI = cast<CallInst>(&I);
5069 
5070   SmallVector<Type *, 4> Tys;
5071   for (Value *ArgOperand : CI->arg_operands())
5072     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
5073 
5074   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5075 
  // The flag indicates whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e., whether performing the
  // intrinsic call is more beneficial than using a library call.
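  //
  // For example (an illustrative case, not tied to this loop): a call to
  // llvm.sqrt.f64 may be widened to llvm.sqrt.v4f64 when the intrinsic cost
  // is no higher than the vector call cost; otherwise a vectorized library
  // function found through VFDatabase is called instead.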
5079   bool NeedToScalarize = false;
5080   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
5081   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
5082   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
5083   assert((UseVectorIntrinsic || !NeedToScalarize) &&
5084          "Instruction should be scalarized elsewhere.");
5085   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
5086          "Either the intrinsic cost or vector call cost must be valid");
5087 
5088   for (unsigned Part = 0; Part < UF; ++Part) {
5089     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
5090     SmallVector<Value *, 4> Args;
5091     for (auto &I : enumerate(ArgOperands.operands())) {
5092       // Some intrinsics have a scalar argument - don't replace it with a
5093       // vector.
5094       Value *Arg;
5095       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5096         Arg = State.get(I.value(), Part);
5097       else {
5098         Arg = State.get(I.value(), VPIteration(0, 0));
5099         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
5100           TysForDecl.push_back(Arg->getType());
5101       }
5102       Args.push_back(Arg);
5103     }
5104 
5105     Function *VectorF;
5106     if (UseVectorIntrinsic) {
5107       // Use vector version of the intrinsic.
5108       if (VF.isVector())
5109         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5110       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5111       assert(VectorF && "Can't retrieve vector intrinsic.");
5112     } else {
5113       // Use vector version of the function call.
5114       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5115 #ifndef NDEBUG
5116       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5117              "Can't create vector function.");
5118 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
5130   }
5131 }
5132 
5133 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5134                                                  VPUser &Operands,
5135                                                  bool InvariantCond,
5136                                                  VPTransformState &State) {
5137   setDebugLocFromInst(&I);
5138 
  // The condition can be loop invariant but still defined inside the
5140   // loop. This means that we can't just use the original 'cond' value.
5141   // We have to take the 'vectorized' value and pick the first lane.
5142   // Instcombine will make this a no-op.
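  //
  // For illustration only (hypothetical names, fixed VF = 4): with an
  // invariant condition we emit, per unroll part,
  //   select i1 %cond, <4 x i32> %a.vec, <4 x i32> %b.vec
  // otherwise the widened condition is used:
  //   select <4 x i1> %cond.vec, <4 x i32> %a.vec, <4 x i32> %b.vec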
5143   auto *InvarCond = InvariantCond
5144                         ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5145                         : nullptr;
5146 
5147   for (unsigned Part = 0; Part < UF; ++Part) {
5148     Value *Cond =
5149         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5150     Value *Op0 = State.get(Operands.getOperand(1), Part);
5151     Value *Op1 = State.get(Operands.getOperand(2), Part);
5152     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5153     State.set(VPDef, Sel, Part);
5154     addMetadata(Sel, &I);
5155   }
5156 }
5157 
5158 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5159   // We should not collect Scalars more than once per VF. Right now, this
5160   // function is called from collectUniformsAndScalars(), which already does
5161   // this check. Collecting Scalars for VF=1 does not make any sense.
5162   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5163          "This function should not be visited twice for the same VF");
5164 
5165   SmallSetVector<Instruction *, 8> Worklist;
5166 
5167   // These sets are used to seed the analysis with pointers used by memory
5168   // accesses that will remain scalar.
5169   SmallSetVector<Instruction *, 8> ScalarPtrs;
5170   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5171   auto *Latch = TheLoop->getLoopLatch();
5172 
5173   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5174   // The pointer operands of loads and stores will be scalar as long as the
5175   // memory access is not a gather or scatter operation. The value operand of a
5176   // store will remain scalar if the store is scalarized.
5177   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5178     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5179     assert(WideningDecision != CM_Unknown &&
5180            "Widening decision should be ready at this moment");
5181     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5182       if (Ptr == Store->getValueOperand())
5183         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
5186     return WideningDecision != CM_GatherScatter;
5187   };
5188 
5189   // A helper that returns true if the given value is a bitcast or
5190   // getelementptr instruction contained in the loop.
5191   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5192     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5193             isa<GetElementPtrInst>(V)) &&
5194            !TheLoop->isLoopInvariant(V);
5195   };
5196 
5197   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5198     if (!isa<PHINode>(Ptr) ||
5199         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5200       return false;
5201     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5202     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5203       return false;
5204     return isScalarUse(MemAccess, Ptr);
5205   };
5206 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted
  // into Worklist. If the use will be a scalar use, and the pointer is
  // only used by memory accesses, we place the pointer in ScalarPtrs.
  // Otherwise, the pointer is placed in PossibleNonScalarPtrs.
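  //
  // For instance, for "store i32 %v, i32* %p" both operands are evaluated:
  // %p (the pointer operand) can stay scalar as long as the store is not a
  // scatter, while %v (the value operand) stays scalar only if the store
  // itself is scalarized.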
5212   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5213     if (isScalarPtrInduction(MemAccess, Ptr)) {
5214       Worklist.insert(cast<Instruction>(Ptr));
5215       Instruction *Update = cast<Instruction>(
5216           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5217       Worklist.insert(Update);
5218       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5219                         << "\n");
5220       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5221                         << "\n");
5222       return;
5223     }
5224     // We only care about bitcast and getelementptr instructions contained in
5225     // the loop.
5226     if (!isLoopVaryingBitCastOrGEP(Ptr))
5227       return;
5228 
5229     // If the pointer has already been identified as scalar (e.g., if it was
5230     // also identified as uniform), there's nothing to do.
5231     auto *I = cast<Instruction>(Ptr);
5232     if (Worklist.count(I))
5233       return;
5234 
5235     // If the use of the pointer will be a scalar use, and all users of the
5236     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5237     // place the pointer in PossibleNonScalarPtrs.
5238     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5239           return isa<LoadInst>(U) || isa<StoreInst>(U);
5240         }))
5241       ScalarPtrs.insert(I);
5242     else
5243       PossibleNonScalarPtrs.insert(I);
5244   };
5245 
  // We seed the scalars analysis with two classes of instructions: (1)
5247   // instructions marked uniform-after-vectorization and (2) bitcast,
5248   // getelementptr and (pointer) phi instructions used by memory accesses
5249   // requiring a scalar use.
5250   //
5251   // (1) Add to the worklist all instructions that have been identified as
5252   // uniform-after-vectorization.
5253   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5254 
5255   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5256   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5258   // scatter operation. The value operand of a store will remain scalar if the
5259   // store is scalarized.
5260   for (auto *BB : TheLoop->blocks())
5261     for (auto &I : *BB) {
5262       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5263         evaluatePtrUse(Load, Load->getPointerOperand());
5264       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5265         evaluatePtrUse(Store, Store->getPointerOperand());
5266         evaluatePtrUse(Store, Store->getValueOperand());
5267       }
5268     }
5269   for (auto *I : ScalarPtrs)
5270     if (!PossibleNonScalarPtrs.count(I)) {
5271       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5272       Worklist.insert(I);
5273     }
5274 
5275   // Insert the forced scalars.
5276   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5277   // induction variable when the PHI user is scalarized.
5278   auto ForcedScalar = ForcedScalars.find(VF);
5279   if (ForcedScalar != ForcedScalars.end())
5280     for (auto *I : ForcedScalar->second)
5281       Worklist.insert(I);
5282 
5283   // Expand the worklist by looking through any bitcasts and getelementptr
5284   // instructions we've already identified as scalar. This is similar to the
5285   // expansion step in collectLoopUniforms(); however, here we're only
5286   // expanding to include additional bitcasts and getelementptr instructions.
5287   unsigned Idx = 0;
5288   while (Idx != Worklist.size()) {
5289     Instruction *Dst = Worklist[Idx++];
5290     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5291       continue;
5292     auto *Src = cast<Instruction>(Dst->getOperand(0));
5293     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5294           auto *J = cast<Instruction>(U);
5295           return !TheLoop->contains(J) || Worklist.count(J) ||
5296                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5297                   isScalarUse(J, Src));
5298         })) {
5299       Worklist.insert(Src);
5300       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5301     }
5302   }
5303 
5304   // An induction variable will remain scalar if all users of the induction
5305   // variable and induction variable update remain scalar.
5306   for (auto &Induction : Legal->getInductionVars()) {
5307     auto *Ind = Induction.first;
5308     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5309 
5310     // If tail-folding is applied, the primary induction variable will be used
5311     // to feed a vector compare.
5312     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5313       continue;
5314 
5315     // Determine if all users of the induction variable are scalar after
5316     // vectorization.
5317     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5318       auto *I = cast<Instruction>(U);
5319       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5320     });
5321     if (!ScalarInd)
5322       continue;
5323 
5324     // Determine if all users of the induction variable update instruction are
5325     // scalar after vectorization.
5326     auto ScalarIndUpdate =
5327         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5328           auto *I = cast<Instruction>(U);
5329           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5330         });
5331     if (!ScalarIndUpdate)
5332       continue;
5333 
5334     // The induction variable and its update instruction will remain scalar.
5335     Worklist.insert(Ind);
5336     Worklist.insert(IndUpdate);
5337     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5338     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5339                       << "\n");
5340   }
5341 
5342   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5343 }
5344 
5345 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5346   if (!blockNeedsPredication(I->getParent()))
5347     return false;
5348   switch(I->getOpcode()) {
5349   default:
5350     break;
5351   case Instruction::Load:
5352   case Instruction::Store: {
5353     if (!Legal->isMaskRequired(I))
5354       return false;
5355     auto *Ptr = getLoadStorePointerOperand(I);
5356     auto *Ty = getLoadStoreType(I);
5357     const Align Alignment = getLoadStoreAlignment(I);
5358     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5359                                 TTI.isLegalMaskedGather(Ty, Alignment))
5360                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5361                                 TTI.isLegalMaskedScatter(Ty, Alignment));
5362   }
5363   case Instruction::UDiv:
5364   case Instruction::SDiv:
5365   case Instruction::SRem:
5366   case Instruction::URem:
5367     return mayDivideByZero(*I);
5368   }
5369   return false;
5370 }
5371 
5372 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5373     Instruction *I, ElementCount VF) {
5374   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5375   assert(getWideningDecision(I, VF) == CM_Unknown &&
5376          "Decision should not be set yet.");
5377   auto *Group = getInterleavedAccessGroup(I);
5378   assert(Group && "Must have a group.");
5379 
  // If the instruction's allocated size doesn't equal its type size, it
5381   // requires padding and will be scalarized.
5382   auto &DL = I->getModule()->getDataLayout();
5383   auto *ScalarTy = getLoadStoreType(I);
5384   if (hasIrregularType(ScalarTy, DL))
5385     return false;
5386 
5387   // Check if masking is required.
5388   // A Group may need masking for one of two reasons: it resides in a block that
5389   // needs predication, or it was decided to use masking to deal with gaps.
5390   bool PredicatedAccessRequiresMasking =
5391       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5392   bool AccessWithGapsRequiresMasking =
5393       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5394   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5395     return true;
5396 
5397   // If masked interleaving is required, we expect that the user/target had
5398   // enabled it, because otherwise it either wouldn't have been created or
5399   // it should have been invalidated by the CostModel.
5400   assert(useMaskedInterleavedAccesses(TTI) &&
5401          "Masked interleave-groups for predicated accesses are not enabled.");
5402 
5403   auto *Ty = getLoadStoreType(I);
5404   const Align Alignment = getLoadStoreAlignment(I);
5405   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5406                           : TTI.isLegalMaskedStore(Ty, Alignment);
5407 }
5408 
5409 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5410     Instruction *I, ElementCount VF) {
5411   // Get and ensure we have a valid memory instruction.
5412   LoadInst *LI = dyn_cast<LoadInst>(I);
5413   StoreInst *SI = dyn_cast<StoreInst>(I);
5414   assert((LI || SI) && "Invalid memory instruction");
5415 
5416   auto *Ptr = getLoadStorePointerOperand(I);
5417 
  // First of all, the pointer must be consecutive for the access to be widened.
5419   if (!Legal->isConsecutivePtr(Ptr))
5420     return false;
5421 
5422   // If the instruction is a store located in a predicated block, it will be
5423   // scalarized.
5424   if (isScalarWithPredication(I))
5425     return false;
5426 
  // If the instruction's allocated size doesn't equal its type size, it
5428   // requires padding and will be scalarized.
5429   auto &DL = I->getModule()->getDataLayout();
5430   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5431   if (hasIrregularType(ScalarTy, DL))
5432     return false;
5433 
5434   return true;
5435 }
5436 
5437 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5438   // We should not collect Uniforms more than once per VF. Right now,
5439   // this function is called from collectUniformsAndScalars(), which
5440   // already does this check. Collecting Uniforms for VF=1 does not make any
5441   // sense.
5442 
5443   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5444          "This function should not be visited twice for the same VF");
5445 
  // Initialize the Uniforms entry for this VF. Even if no uniform value is
  // found, we will not analyze this VF again: Uniforms.count(VF) will
  // return 1.
5448   Uniforms[VF].clear();
5449 
5450   // We now know that the loop is vectorizable!
5451   // Collect instructions inside the loop that will remain uniform after
5452   // vectorization.
5453 
5454   // Global values, params and instructions outside of current loop are out of
5455   // scope.
5456   auto isOutOfScope = [&](Value *V) -> bool {
5457     Instruction *I = dyn_cast<Instruction>(V);
5458     return (!I || !TheLoop->contains(I));
5459   };
5460 
5461   SetVector<Instruction *> Worklist;
5462   BasicBlock *Latch = TheLoop->getLoopLatch();
5463 
5464   // Instructions that are scalar with predication must not be considered
5465   // uniform after vectorization, because that would create an erroneous
5466   // replicating region where only a single instance out of VF should be formed.
5467   // TODO: optimize such seldom cases if found important, see PR40816.
5468   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5469     if (isOutOfScope(I)) {
5470       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5471                         << *I << "\n");
5472       return;
5473     }
5474     if (isScalarWithPredication(I)) {
5475       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5476                         << *I << "\n");
5477       return;
5478     }
5479     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5480     Worklist.insert(I);
5481   };
5482 
5483   // Start with the conditional branch. If the branch condition is an
5484   // instruction contained in the loop that is only used by the branch, it is
5485   // uniform.
5486   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5487   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5488     addToWorklistIfAllowed(Cmp);
5489 
5490   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5491     InstWidening WideningDecision = getWideningDecision(I, VF);
5492     assert(WideningDecision != CM_Unknown &&
5493            "Widening decision should be ready at this moment");
5494 
5495     // A uniform memory op is itself uniform.  We exclude uniform stores
5496     // here as they demand the last lane, not the first one.
5497     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5498       assert(WideningDecision == CM_Scalarize);
5499       return true;
5500     }
5501 
5502     return (WideningDecision == CM_Widen ||
5503             WideningDecision == CM_Widen_Reverse ||
5504             WideningDecision == CM_Interleave);
5505   };
5506 
5507 
5508   // Returns true if Ptr is the pointer operand of a memory access instruction
5509   // I, and I is known to not require scalarization.
5510   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5511     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5512   };
5513 
5514   // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
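  //
  // For example, the pointer operand of a consecutive, widened load has a
  // uniform use in this sense: only the lane-0 address is needed to form the
  // single wide load, even though conceptually each lane has its own address.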
5519   SetVector<Value *> HasUniformUse;
5520 
5521   // Scan the loop for instructions which are either a) known to have only
5522   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5523   for (auto *BB : TheLoop->blocks())
5524     for (auto &I : *BB) {
5525       // If there's no pointer operand, there's nothing to do.
5526       auto *Ptr = getLoadStorePointerOperand(&I);
5527       if (!Ptr)
5528         continue;
5529 
5530       // A uniform memory op is itself uniform.  We exclude uniform stores
5531       // here as they demand the last lane, not the first one.
5532       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5533         addToWorklistIfAllowed(&I);
5534 
5535       if (isUniformDecision(&I, VF)) {
5536         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5537         HasUniformUse.insert(Ptr);
5538       }
5539     }
5540 
  // Add to the worklist any operands which have *only* uniform (i.e. lane-0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
5544   for (auto *V : HasUniformUse) {
5545     if (isOutOfScope(V))
5546       continue;
5547     auto *I = cast<Instruction>(V);
5548     auto UsersAreMemAccesses =
5549       llvm::all_of(I->users(), [&](User *U) -> bool {
5550         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5551       });
5552     if (UsersAreMemAccesses)
5553       addToWorklistIfAllowed(I);
5554   }
5555 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5559   unsigned idx = 0;
5560   while (idx != Worklist.size()) {
5561     Instruction *I = Worklist[idx++];
5562 
5563     for (auto OV : I->operand_values()) {
5564       // isOutOfScope operands cannot be uniform instructions.
5565       if (isOutOfScope(OV))
5566         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5569       auto *OP = dyn_cast<PHINode>(OV);
5570       if (OP && Legal->isFirstOrderRecurrence(OP))
5571         continue;
5572       // If all the users of the operand are uniform, then add the
5573       // operand into the uniform worklist.
5574       auto *OI = cast<Instruction>(OV);
5575       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5576             auto *J = cast<Instruction>(U);
5577             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5578           }))
5579         addToWorklistIfAllowed(OI);
5580     }
5581   }
5582 
5583   // For an instruction to be added into Worklist above, all its users inside
5584   // the loop should also be in Worklist. However, this condition cannot be
5585   // true for phi nodes that form a cyclic dependence. We must process phi
5586   // nodes separately. An induction variable will remain uniform if all users
5587   // of the induction variable and induction variable update remain uniform.
5588   // The code below handles both pointer and non-pointer induction variables.
5589   for (auto &Induction : Legal->getInductionVars()) {
5590     auto *Ind = Induction.first;
5591     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5592 
5593     // Determine if all users of the induction variable are uniform after
5594     // vectorization.
5595     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5596       auto *I = cast<Instruction>(U);
5597       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5598              isVectorizedMemAccessUse(I, Ind);
5599     });
5600     if (!UniformInd)
5601       continue;
5602 
5603     // Determine if all users of the induction variable update instruction are
5604     // uniform after vectorization.
5605     auto UniformIndUpdate =
5606         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5607           auto *I = cast<Instruction>(U);
5608           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5609                  isVectorizedMemAccessUse(I, IndUpdate);
5610         });
5611     if (!UniformIndUpdate)
5612       continue;
5613 
5614     // The induction variable and its update instruction will remain uniform.
5615     addToWorklistIfAllowed(Ind);
5616     addToWorklistIfAllowed(IndUpdate);
5617   }
5618 
5619   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5620 }
5621 
5622 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5623   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5624 
5625   if (Legal->getRuntimePointerChecking()->Need) {
5626     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5627         "runtime pointer checks needed. Enable vectorization of this "
5628         "loop with '#pragma clang loop vectorize(enable)' when "
5629         "compiling with -Os/-Oz",
5630         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5631     return true;
5632   }
5633 
5634   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5635     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5636         "runtime SCEV checks needed. Enable vectorization of this "
5637         "loop with '#pragma clang loop vectorize(enable)' when "
5638         "compiling with -Os/-Oz",
5639         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5640     return true;
5641   }
5642 
5643   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5644   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5645     reportVectorizationFailure("Runtime stride check for small trip count",
5646         "runtime stride == 1 checks needed. Enable vectorization of "
5647         "this loop without such check by compiling with -Os/-Oz",
5648         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5649     return true;
5650   }
5651 
5652   return false;
5653 }
5654 
5655 ElementCount
5656 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5657   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5658     reportVectorizationInfo(
5659         "Disabling scalable vectorization, because target does not "
5660         "support scalable vectors.",
5661         "ScalableVectorsUnsupported", ORE, TheLoop);
5662     return ElementCount::getScalable(0);
5663   }
5664 
5665   if (Hints->isScalableVectorizationDisabled()) {
5666     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5667                             "ScalableVectorizationDisabled", ORE, TheLoop);
5668     return ElementCount::getScalable(0);
5669   }
5670 
5671   auto MaxScalableVF = ElementCount::getScalable(
5672       std::numeric_limits<ElementCount::ScalarTy>::max());
5673 
5674   // Disable scalable vectorization if the loop contains unsupported reductions.
5675   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5676   // FIXME: While for scalable vectors this is currently sufficient, this should
5677   // be replaced by a more detailed mechanism that filters out specific VFs,
5678   // instead of invalidating vectorization for a whole set of VFs based on the
5679   // MaxVF.
5680   if (!canVectorizeReductions(MaxScalableVF)) {
5681     reportVectorizationInfo(
5682         "Scalable vectorization not supported for the reduction "
5683         "operations found in this loop.",
5684         "ScalableVFUnfeasible", ORE, TheLoop);
5685     return ElementCount::getScalable(0);
5686   }
5687 
5688   if (Legal->isSafeForAnyVectorWidth())
5689     return MaxScalableVF;
5690 
5691   // Limit MaxScalableVF by the maximum safe dependence distance.
5692   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5693   MaxScalableVF = ElementCount::getScalable(
5694       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5695   if (!MaxScalableVF)
5696     reportVectorizationInfo(
5697         "Max legal vector width too small, scalable vectorization "
5698         "unfeasible.",
5699         "ScalableVFUnfeasible", ORE, TheLoop);
5700 
5701   return MaxScalableVF;
5702 }
5703 
5704 FixedScalableVFPair
5705 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5706                                                  ElementCount UserVF) {
5707   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5708   unsigned SmallestType, WidestType;
5709   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5710 
5711   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5715   unsigned MaxSafeElements =
5716       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
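  // For example (illustrative numbers): a maximum safe vector width of 256
  // bits and a widest type of 32 bits give
  // MaxSafeElements = PowerOf2Floor(256 / 32) = 8.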
5717 
5718   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5719   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5720 
5721   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5722                     << ".\n");
5723   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5724                     << ".\n");
5725 
5726   // First analyze the UserVF, fall back if the UserVF should be ignored.
5727   if (UserVF) {
5728     auto MaxSafeUserVF =
5729         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5730 
5731     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
5732       return UserVF;
5733 
5734     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5735 
5736     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5737     // is better to ignore the hint and let the compiler choose a suitable VF.
5738     if (!UserVF.isScalable()) {
5739       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5740                         << " is unsafe, clamping to max safe VF="
5741                         << MaxSafeFixedVF << ".\n");
5742       ORE->emit([&]() {
5743         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5744                                           TheLoop->getStartLoc(),
5745                                           TheLoop->getHeader())
5746                << "User-specified vectorization factor "
5747                << ore::NV("UserVectorizationFactor", UserVF)
5748                << " is unsafe, clamping to maximum safe vectorization factor "
5749                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5750       });
5751       return MaxSafeFixedVF;
5752     }
5753 
5754     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5755                       << " is unsafe. Ignoring scalable UserVF.\n");
5756     ORE->emit([&]() {
5757       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5758                                         TheLoop->getStartLoc(),
5759                                         TheLoop->getHeader())
5760              << "User-specified vectorization factor "
5761              << ore::NV("UserVectorizationFactor", UserVF)
5762              << " is unsafe. Ignoring the hint to let the compiler pick a "
5763                 "suitable VF.";
5764     });
5765   }
5766 
5767   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5768                     << " / " << WidestType << " bits.\n");
5769 
5770   FixedScalableVFPair Result(ElementCount::getFixed(1),
5771                              ElementCount::getScalable(0));
5772   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5773                                            WidestType, MaxSafeFixedVF))
5774     Result.FixedVF = MaxVF;
5775 
5776   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5777                                            WidestType, MaxSafeScalableVF))
5778     if (MaxVF.isScalable()) {
5779       Result.ScalableVF = MaxVF;
5780       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5781                         << "\n");
5782     }
5783 
5784   return Result;
5785 }
5786 
5787 FixedScalableVFPair
5788 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5789   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5792     reportVectorizationFailure(
5793         "Not inserting runtime ptr check for divergent target",
5794         "runtime pointer checks needed. Not enabled for divergent target",
5795         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5796     return FixedScalableVFPair::getNone();
5797   }
5798 
5799   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5800   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5801   if (TC == 1) {
5802     reportVectorizationFailure("Single iteration (non) loop",
5803         "loop trip count is one, irrelevant for vectorization",
5804         "SingleIterationLoop", ORE, TheLoop);
5805     return FixedScalableVFPair::getNone();
5806   }
5807 
5808   switch (ScalarEpilogueStatus) {
5809   case CM_ScalarEpilogueAllowed:
5810     return computeFeasibleMaxVF(TC, UserVF);
5811   case CM_ScalarEpilogueNotAllowedUsePredicate:
5812     LLVM_FALLTHROUGH;
5813   case CM_ScalarEpilogueNotNeededUsePredicate:
5814     LLVM_DEBUG(
5815         dbgs() << "LV: vector predicate hint/switch found.\n"
5816                << "LV: Not allowing scalar epilogue, creating predicated "
5817                << "vector loop.\n");
5818     break;
5819   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5820     // fallthrough as a special case of OptForSize
5821   case CM_ScalarEpilogueNotAllowedOptSize:
5822     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5823       LLVM_DEBUG(
5824           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5825     else
5826       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5827                         << "count.\n");
5828 
5829     // Bail if runtime checks are required, which are not good when optimising
5830     // for size.
5831     if (runtimeChecksRequired())
5832       return FixedScalableVFPair::getNone();
5833 
5834     break;
5835   }
5836 
5837   // The only loops we can vectorize without a scalar epilogue, are loops with
5838   // a bottom-test and a single exiting block. We'd have to handle the fact
5839   // that not every instruction executes on the last iteration.  This will
5840   // require a lane mask which varies through the vector loop body.  (TODO)
5841   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5842     // If there was a tail-folding hint/switch, but we can't fold the tail by
5843     // masking, fallback to a vectorization with a scalar epilogue.
5844     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5845       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5846                            "scalar epilogue instead.\n");
5847       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5848       return computeFeasibleMaxVF(TC, UserVF);
5849     }
5850     return FixedScalableVFPair::getNone();
5851   }
5852 
5853   // Now try the tail folding
5854 
5855   // Invalidate interleave groups that require an epilogue if we can't mask
5856   // the interleave-group.
5857   if (!useMaskedInterleavedAccesses(TTI)) {
5858     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5859            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5862     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5863   }
5864 
5865   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we might choose.
5868   // FIXME: The condition below pessimises the case for fixed-width vectors,
5869   // when scalable VFs are also candidates for vectorization.
5870   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5871     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5872     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5873            "MaxFixedVF must be a power of 2");
5874     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5875                                    : MaxFixedVF.getFixedValue();
5876     ScalarEvolution *SE = PSE.getSE();
5877     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5878     const SCEV *ExitCount = SE->getAddExpr(
5879         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5880     const SCEV *Rem = SE->getURemExpr(
5881         SE->applyLoopGuards(ExitCount, TheLoop),
5882         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5883     if (Rem->isZero()) {
5884       // Accept MaxFixedVF if we do not have a tail.
5885       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5886       return MaxFactors;
5887     }
5888   }
5889 
5890   // If we don't know the precise trip count, or if the trip count that we
5891   // found modulo the vectorization factor is not zero, try to fold the tail
5892   // by masking.
5893   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5894   if (Legal->prepareToFoldTailByMasking()) {
5895     FoldTailByMasking = true;
5896     return MaxFactors;
5897   }
5898 
5899   // If there was a tail-folding hint/switch, but we can't fold the tail by
5900   // masking, fallback to a vectorization with a scalar epilogue.
5901   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5902     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5903                          "scalar epilogue instead.\n");
5904     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5905     return MaxFactors;
5906   }
5907 
5908   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5909     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5910     return FixedScalableVFPair::getNone();
5911   }
5912 
5913   if (TC == 0) {
5914     reportVectorizationFailure(
5915         "Unable to calculate the loop count due to complex control flow",
5916         "unable to calculate the loop count due to complex control flow",
5917         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5918     return FixedScalableVFPair::getNone();
5919   }
5920 
5921   reportVectorizationFailure(
5922       "Cannot optimize for size and vectorize at the same time.",
5923       "cannot optimize for size and vectorize at the same time. "
5924       "Enable vectorization of this loop with '#pragma clang loop "
5925       "vectorize(enable)' when compiling with -Os/-Oz",
5926       "NoTailLoopWithOptForSize", ORE, TheLoop);
5927   return FixedScalableVFPair::getNone();
5928 }
5929 
5930 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5931     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5932     const ElementCount &MaxSafeVF) {
5933   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5934   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5935       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5936                            : TargetTransformInfo::RGK_FixedWidthVector);
5937 
5938   // Convenience function to return the minimum of two ElementCounts.
5939   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5940     assert((LHS.isScalable() == RHS.isScalable()) &&
5941            "Scalable flags must match");
5942     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5943   };
5944 
5945   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5947   auto MaxVectorElementCount = ElementCount::get(
5948       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5949       ComputeScalableMaxVF);
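  // For example (illustrative numbers): a 128-bit fixed-width register and a
  // widest type of 32 bits allow at most 4 elements per register; with
  // scalable vectors the same computation yields vscale x 4. The result is
  // then clamped by MaxSafeVF below.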
5950   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5951   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5952                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5953 
5954   if (!MaxVectorElementCount) {
5955     LLVM_DEBUG(dbgs() << "LV: The target has no "
5956                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5957                       << " vector registers.\n");
5958     return ElementCount::getFixed(1);
5959   }
5960 
5961   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5962   if (ConstTripCount &&
5963       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5964       isPowerOf2_32(ConstTripCount)) {
5965     // We need to clamp the VF to be the ConstTripCount. There is no point in
5966     // choosing a higher viable VF as done in the loop below. If
5967     // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5968     // the TC is less than or equal to the known number of lanes.
5969     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5970                       << ConstTripCount << "\n");
5971     return TripCountEC;
5972   }
5973 
5974   ElementCount MaxVF = MaxVectorElementCount;
5975   if (TTI.shouldMaximizeVectorBandwidth() ||
5976       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5977     auto MaxVectorElementCountMaxBW = ElementCount::get(
5978         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5979         ComputeScalableMaxVF);
5980     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5981 
5982     // Collect all viable vectorization factors larger than the default MaxVF
5983     // (i.e. MaxVectorElementCount).
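    // For example (hypothetical values): with MaxVectorElementCount = 4 and
    // MaxVectorElementCountMaxBW = 16, this collects VFs = {8, 16}.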
5984     SmallVector<ElementCount, 8> VFs;
5985     for (ElementCount VS = MaxVectorElementCount * 2;
5986          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5987       VFs.push_back(VS);
5988 
5989     // For each VF calculate its register usage.
5990     auto RUs = calculateRegisterUsage(VFs);
5991 
    // Select the largest VF that doesn't require more registers than the
    // target provides.
5994     for (int i = RUs.size() - 1; i >= 0; --i) {
5995       bool Selected = true;
5996       for (auto &pair : RUs[i].MaxLocalUsers) {
5997         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5998         if (pair.second > TargetNumRegisters)
5999           Selected = false;
6000       }
6001       if (Selected) {
6002         MaxVF = VFs[i];
6003         break;
6004       }
6005     }
6006     if (ElementCount MinVF =
6007             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
6008       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
6009         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
6010                           << ") with target's minimum: " << MinVF << '\n');
6011         MaxVF = MinVF;
6012       }
6013     }
6014   }
6015   return MaxVF;
6016 }
6017 
6018 bool LoopVectorizationCostModel::isMoreProfitable(
6019     const VectorizationFactor &A, const VectorizationFactor &B) const {
6020   InstructionCost::CostType CostA = *A.Cost.getValue();
6021   InstructionCost::CostType CostB = *B.Cost.getValue();
6022 
6023   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
6024 
6025   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
6026       MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly
    // small) constant, the trip count will be rounded up to an integer number
    // of vector iterations. The total cost is then
    // PerIterationCost * ceil(TripCount / VF), which we compare directly.
    // When not folding the tail, the total cost is
    // PerIterationCost * floor(TC / VF) plus the scalar remainder cost, and so
    // is approximated with the per-lane cost below instead of using the trip
    // count as here.
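    // As a hypothetical example with MaxTripCount = 10: a VF=4 plan costing
    // 20 per iteration totals 20 * ceil(10/4) = 60, while a VF=8 plan costing
    // 36 totals 36 * ceil(10/8) = 72, so the VF=4 plan wins here even though
    // its per-lane cost (5 vs. 4.5) is higher.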
6034     int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
6035     int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
6036     return RTCostA < RTCostB;
6037   }
6038 
  // When scalable vectorization is preferred, assume for now that vscale may
  // be larger than 1, so that scalable vectorization is slightly favored over
  // fixed-width vectorization.
6042   if (Hints->isScalableVectorizationPreferred())
6043     if (A.Width.isScalable() && !B.Width.isScalable())
6044       return (CostA * B.Width.getKnownMinValue()) <=
6045              (CostB * A.Width.getKnownMinValue());
6046 
6047   // To avoid the need for FP division:
6048   //      (CostA / A.Width) < (CostB / B.Width)
6049   // <=>  (CostA * B.Width) < (CostB * A.Width)
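  // E.g. (hypothetical costs): A = {VF=4, Cost=20} vs. B = {VF=8, Cost=36}
  // compares 20 * 8 = 160 against 36 * 4 = 144, so A is not more profitable.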
6050   return (CostA * B.Width.getKnownMinValue()) <
6051          (CostB * A.Width.getKnownMinValue());
6052 }
6053 
6054 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
6055     const ElementCountSet &VFCandidates) {
6056   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
6057   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
6058   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
6059   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
6060          "Expected Scalar VF to be a candidate");
6061 
6062   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
6063   VectorizationFactor ChosenFactor = ScalarCost;
6064 
6065   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6066   if (ForceVectorization && VFCandidates.size() > 1) {
6067     // Ignore scalar width, because the user explicitly wants vectorization.
6068     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
6069     // evaluation.
6070     ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
6071   }
6072 
6073   for (const auto &i : VFCandidates) {
6074     // The cost for scalar VF=1 is already calculated, so ignore it.
6075     if (i.isScalar())
6076       continue;
6077 
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
6081     VectorizationCostTy C = expectedCost(i);
6082 
6083     assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
6084     VectorizationFactor Candidate(i, C.first);
6085     LLVM_DEBUG(
6086         dbgs() << "LV: Vector loop of width " << i << " costs: "
6087                << (*Candidate.Cost.getValue() /
6088                    Candidate.Width.getKnownMinValue())
6089                << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
6090                << ".\n");
6091 
6092     if (!C.second && !ForceVectorization) {
6093       LLVM_DEBUG(
6094           dbgs() << "LV: Not considering vector loop of width " << i
6095                  << " because it will not generate any vector instructions.\n");
6096       continue;
6097     }
6098 
    // If profitable, add it to the ProfitableVFs list.
6100     if (isMoreProfitable(Candidate, ScalarCost))
6101       ProfitableVFs.push_back(Candidate);
6102 
6103     if (isMoreProfitable(Candidate, ChosenFactor))
6104       ChosenFactor = Candidate;
6105   }
6106 
6107   if (!EnableCondStoresVectorization && NumPredStores) {
6108     reportVectorizationFailure("There are conditional stores.",
6109         "store that is conditionally executed prevents vectorization",
6110         "ConditionalStore", ORE, TheLoop);
6111     ChosenFactor = ScalarCost;
6112   }
6113 
6114   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
6115                  *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
6116                  dbgs()
6117              << "LV: Vectorization seems to be not beneficial, "
6118              << "but was forced by a user.\n");
6119   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
6120   return ChosenFactor;
6121 }
6122 
6123 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
6124     const Loop &L, ElementCount VF) const {
6125   // Cross iteration phis such as reductions need special handling and are
6126   // currently unsupported.
6127   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
6128         return Legal->isFirstOrderRecurrence(&Phi) ||
6129                Legal->isReductionVariable(&Phi);
6130       }))
6131     return false;
6132 
6133   // Phis with uses outside of the loop require special handling and are
6134   // currently unsupported.
6135   for (auto &Entry : Legal->getInductionVars()) {
6136     // Look for uses of the value of the induction at the last iteration.
6137     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
6138     for (User *U : PostInc->users())
6139       if (!L.contains(cast<Instruction>(U)))
6140         return false;
    // Look for uses of the penultimate value of the induction.
6142     for (User *U : Entry.first->users())
6143       if (!L.contains(cast<Instruction>(U)))
6144         return false;
6145   }
6146 
6147   // Induction variables that are widened require special handling that is
6148   // currently not supported.
6149   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6150         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6151                  this->isProfitableToScalarize(Entry.first, VF));
6152       }))
6153     return false;
6154 
6155   return true;
6156 }
6157 
6158 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6159     const ElementCount VF) const {
6160   // FIXME: We need a much better cost-model to take different parameters such
6161   // as register pressure, code size increase and cost of extra branches into
6162   // account. For now we apply a very crude heuristic and only consider loops
6163   // with vectorization factors larger than a certain value.
6164   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
6166   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6167     return false;
6168   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6169     return true;
6170   return false;
6171 }
6172 
6173 VectorizationFactor
6174 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6175     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6176   VectorizationFactor Result = VectorizationFactor::Disabled();
6177   if (!EnableEpilogueVectorization) {
6178     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6179     return Result;
6180   }
6181 
6182   if (!isScalarEpilogueAllowed()) {
6183     LLVM_DEBUG(
6184         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6185                   "allowed.\n";);
6186     return Result;
6187   }
6188 
6189   // FIXME: This can be fixed for scalable vectors later, because at this stage
6190   // the LoopVectorizer will only consider vectorizing a loop with scalable
6191   // vectors when the loop has a hint to enable vectorization for a given VF.
6192   if (MainLoopVF.isScalable()) {
6193     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
6194                          "yet supported.\n");
6195     return Result;
6196   }
6197 
6198   // Not really a cost consideration, but check for unsupported cases here to
6199   // simplify the logic.
6200   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
6201     LLVM_DEBUG(
6202         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
6203                   "not a supported candidate.\n";);
6204     return Result;
6205   }
6206 
6207   if (EpilogueVectorizationForceVF > 1) {
6208     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
6209     if (LVP.hasPlanWithVFs(
6210             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
6211       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
6212     else {
6213       LLVM_DEBUG(
6214           dbgs()
6215               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6216       return Result;
6217     }
6218   }
6219 
6220   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6221       TheLoop->getHeader()->getParent()->hasMinSize()) {
6222     LLVM_DEBUG(
6223         dbgs()
6224             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6225     return Result;
6226   }
6227 
6228   if (!isEpilogueVectorizationProfitable(MainLoopVF))
6229     return Result;
6230 
6231   for (auto &NextVF : ProfitableVFs)
6232     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6233         (Result.Width.getFixedValue() == 1 ||
6234          isMoreProfitable(NextVF, Result)) &&
6235         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6236       Result = NextVF;
6237 
6238   if (Result != VectorizationFactor::Disabled())
6239     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6240                       << Result.Width.getFixedValue() << "\n";);
6241   return Result;
6242 }
6243 
6244 std::pair<unsigned, unsigned>
6245 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6246   unsigned MinWidth = -1U;
6247   unsigned MaxWidth = 8;
6248   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6249 
6250   // For each block.
6251   for (BasicBlock *BB : TheLoop->blocks()) {
6252     // For each instruction in the loop.
6253     for (Instruction &I : BB->instructionsWithoutDebug()) {
6254       Type *T = I.getType();
6255 
6256       // Skip ignored values.
6257       if (ValuesToIgnore.count(&I))
6258         continue;
6259 
6260       // Only examine Loads, Stores and PHINodes.
6261       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6262         continue;
6263 
6264       // Examine PHI nodes that are reduction variables. Update the type to
6265       // account for the recurrence type.
6266       if (auto *PN = dyn_cast<PHINode>(&I)) {
6267         if (!Legal->isReductionVariable(PN))
6268           continue;
6269         const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
6270         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6271             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6272                                       RdxDesc.getRecurrenceType(),
6273                                       TargetTransformInfo::ReductionFlags()))
6274           continue;
6275         T = RdxDesc.getRecurrenceType();
6276       }
6277 
6278       // Examine the stored values.
6279       if (auto *ST = dyn_cast<StoreInst>(&I))
6280         T = ST->getValueOperand()->getType();
6281 
6282       // Ignore loaded pointer types and stored pointer types that are not
6283       // vectorizable.
6284       //
6285       // FIXME: The check here attempts to predict whether a load or store will
6286       //        be vectorized. We only know this for certain after a VF has
6287       //        been selected. Here, we assume that if an access can be
6288       //        vectorized, it will be. We should also look at extending this
6289       //        optimization to non-pointer types.
6290       //
6291       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6292           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6293         continue;
6294 
6295       MinWidth = std::min(MinWidth,
6296                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6297       MaxWidth = std::max(MaxWidth,
6298                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6299     }
6300   }
6301 
6302   return {MinWidth, MaxWidth};
6303 }
6304 
6305 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6306                                                            unsigned LoopCost) {
6307   // -- The interleave heuristics --
6308   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6309   // There are many micro-architectural considerations that we can't predict
6310   // at this level. For example, frontend pressure (on decode or fetch) due to
6311   // code size, or the number and capabilities of the execution ports.
6312   //
6313   // We use the following heuristics to select the interleave count:
6314   // 1. If the code has reductions, then we interleave to break the cross
6315   // iteration dependency.
6316   // 2. If the loop is really small, then we interleave to reduce the loop
6317   // overhead.
6318   // 3. We don't interleave if we think that we will spill registers to memory
6319   // due to the increased register pressure.
6320 
6321   if (!isScalarEpilogueAllowed())
6322     return 1;
6323 
  // If there is a finite maximum safe dependence distance, it already limits
  // how many elements can be processed per iteration, so do not interleave.
6325   if (Legal->getMaxSafeDepDistBytes() != -1U)
6326     return 1;
6327 
6328   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6329   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
6335   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6336       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6337     return 1;
6338 
6339   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so assume that each register class has
  // at least one instruction that uses at least one register.
6342   for (auto& pair : R.MaxLocalUsers) {
6343     pair.second = std::max(pair.second, 1U);
6344   }
6345 
6346   // We calculate the interleave count using the following formula.
6347   // Subtract the number of loop invariants from the number of available
6348   // registers. These registers are used by all of the interleaved instances.
6349   // Next, divide the remaining registers by the number of registers that is
6350   // required by the loop, in order to estimate how many parallel instances
6351   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations and alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case
  // the interleave count is forced to 1 above.
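  // As a hypothetical example: with 32 registers in a class, 2 of them used
  // by loop invariants and 7 values live at the peak, the estimate is
  // PowerOf2Floor((32 - 2) / 7) = PowerOf2Floor(4) = 4 interleaved copies.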
6357   unsigned IC = UINT_MAX;
6358 
6359   for (auto& pair : R.MaxLocalUsers) {
6360     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6361     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6362                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
6364     if (VF.isScalar()) {
6365       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6366         TargetNumRegisters = ForceTargetNumScalarRegs;
6367     } else {
6368       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6369         TargetNumRegisters = ForceTargetNumVectorRegs;
6370     }
6371     unsigned MaxLocalUsers = pair.second;
6372     unsigned LoopInvariantRegs = 0;
6373     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6374       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6375 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
6377     // Don't count the induction variable as interleaved.
6378     if (EnableIndVarRegisterHeur) {
6379       TmpIC =
6380           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6381                         std::max(1U, (MaxLocalUsers - 1)));
6382     }
6383 
6384     IC = std::min(IC, TmpIC);
6385   }
6386 
6387   // Clamp the interleave ranges to reasonable counts.
6388   unsigned MaxInterleaveCount =
6389       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6390 
6391   // Check if the user has overridden the max.
6392   if (VF.isScalar()) {
6393     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6394       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6395   } else {
6396     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6397       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6398   }
6399 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to the trip count divided by VF, provided it is at
  // least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely
  // to be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
6410   if (BestKnownTC) {
6411     MaxInterleaveCount =
6412         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6413     // Make sure MaxInterleaveCount is greater than 0.
6414     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6415   }
6416 
6417   assert(MaxInterleaveCount > 0 &&
6418          "Maximum interleave count must be greater than 0");
6419 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
6422   if (IC > MaxInterleaveCount)
6423     IC = MaxInterleaveCount;
6424   else
6425     // Make sure IC is greater than 0.
6426     IC = std::max(1u, IC);
6427 
6428   assert(IC > 0 && "Interleave count must be greater than 0.");
6429 
6430   // If we did not calculate the cost for VF (because the user selected the VF)
6431   // then we calculate the cost of VF here.
6432   if (LoopCost == 0) {
6433     assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6434     LoopCost = *expectedCost(VF).first.getValue();
6435   }
6436 
6437   assert(LoopCost && "Non-zero loop cost expected");
6438 
6439   // Interleave if we vectorized this loop and there is a reduction that could
6440   // benefit from interleaving.
6441   if (VF.isVector() && HasReductions) {
6442     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6443     return IC;
6444   }
6445 
6446   // Note that if we've already vectorized the loop we will have done the
6447   // runtime check and so interleaving won't require further checks.
6448   bool InterleavingRequiresRuntimePointerCheck =
6449       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6450 
6451   // We want to interleave small loops in order to reduce the loop overhead and
6452   // potentially expose ILP opportunities.
6453   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6454                     << "LV: IC is " << IC << '\n'
6455                     << "LV: VF is " << VF << '\n');
6456   const bool AggressivelyInterleaveReductions =
6457       TTI.enableAggressiveInterleaving(HasReductions);
6458   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1, and we use the cost model to
    // estimate the cost of the loop; we interleave until the loop overhead is
    // about 5% of the cost of the loop.
6462     unsigned SmallIC =
6463         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6464 
6465     // Interleave until store/load ports (estimated by max interleave count) are
6466     // saturated.
6467     unsigned NumStores = Legal->getNumStores();
6468     unsigned NumLoads = Legal->getNumLoads();
6469     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6470     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6471 
6472     // If we have a scalar reduction (vector reductions are already dealt with
6473     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit it, by default, to 2,
    // so that the critical path is only increased by one reduction operation.
6476     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6477       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6478       SmallIC = std::min(SmallIC, F);
6479       StoresIC = std::min(StoresIC, F);
6480       LoadsIC = std::min(LoadsIC, F);
6481     }
6482 
6483     if (EnableLoadStoreRuntimeInterleave &&
6484         std::max(StoresIC, LoadsIC) > SmallIC) {
6485       LLVM_DEBUG(
6486           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6487       return std::max(StoresIC, LoadsIC);
6488     }
6489 
6490     // If there are scalar reductions and TTI has enabled aggressive
6491     // interleaving for reductions, we will interleave to expose ILP.
6492     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6493         AggressivelyInterleaveReductions) {
6494       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC, but not as aggressively as the normal
      // IC, to handle the rare situation where resources are too limited.
6497       return std::max(IC / 2, SmallIC);
6498     } else {
6499       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6500       return SmallIC;
6501     }
6502   }
6503 
6504   // Interleave if this is a large loop (small loops are already dealt with by
6505   // this point) that could benefit from interleaving.
6506   if (AggressivelyInterleaveReductions) {
6507     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6508     return IC;
6509   }
6510 
6511   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6512   return 1;
6513 }
6514 
6515 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6516 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6517   // This function calculates the register usage by measuring the highest number
6518   // of values that are alive at a single location. Obviously, this is a very
  // rough estimate. We scan the loop in topological order and
6520   // assign a number to each instruction. We use RPO to ensure that defs are
6521   // met before their users. We assume that each instruction that has in-loop
6522   // users starts an interval. We record every time that an in-loop value is
6523   // used, so we have a list of the first and last occurrences of each
6524   // instruction. Next, we transpose this data structure into a multi map that
6525   // holds the list of intervals that *end* at a specific location. This multi
6526   // map allows us to perform a linear search. We scan the instructions linearly
6527   // and record each time that a new interval starts, by placing it in a set.
6528   // If we find this value in the multi-map then we remove it from the set.
6529   // The max register usage is the maximum size of the set.
6530   // We also search for instructions that are defined outside the loop, but are
6531   // used inside the loop. We need this number separately from the max-interval
6532   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
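  // As a small hypothetical illustration: in a body such as
  //   %a = load ...; %b = add %a, 1; %c = mul %a, %b; store %c
  // the intervals for %a and %b both extend to their last use in %c, so both
  // are counted as live there; the peak size of that live set is what this
  // scan estimates per register class.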
6534   LoopBlocksDFS DFS(TheLoop);
6535   DFS.perform(LI);
6536 
6537   RegisterUsage RU;
6538 
6539   // Each 'key' in the map opens a new interval. The values
6540   // of the map are the index of the 'last seen' usage of the
6541   // instruction that is the key.
6542   using IntervalMap = DenseMap<Instruction *, unsigned>;
6543 
6544   // Maps instruction to its index.
6545   SmallVector<Instruction *, 64> IdxToInstr;
6546   // Marks the end of each interval.
6547   IntervalMap EndPoint;
  // Saves the set of instructions that are used within the loop.
6549   SmallPtrSet<Instruction *, 8> Ends;
6550   // Saves the list of values that are used in the loop but are
6551   // defined outside the loop, such as arguments and constants.
6552   SmallPtrSet<Value *, 8> LoopInvariants;
6553 
6554   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6555     for (Instruction &I : BB->instructionsWithoutDebug()) {
6556       IdxToInstr.push_back(&I);
6557 
6558       // Save the end location of each USE.
6559       for (Value *U : I.operands()) {
6560         auto *Instr = dyn_cast<Instruction>(U);
6561 
6562         // Ignore non-instruction values such as arguments, constants, etc.
6563         if (!Instr)
6564           continue;
6565 
6566         // If this instruction is outside the loop then record it and continue.
6567         if (!TheLoop->contains(Instr)) {
6568           LoopInvariants.insert(Instr);
6569           continue;
6570         }
6571 
6572         // Overwrite previous end points.
6573         EndPoint[Instr] = IdxToInstr.size();
6574         Ends.insert(Instr);
6575       }
6576     }
6577   }
6578 
6579   // Saves the list of intervals that end with the index in 'key'.
6580   using InstrList = SmallVector<Instruction *, 2>;
6581   DenseMap<unsigned, InstrList> TransposeEnds;
6582 
6583   // Transpose the EndPoints to a list of values that end at each index.
6584   for (auto &Interval : EndPoint)
6585     TransposeEnds[Interval.second].push_back(Interval.first);
6586 
6587   SmallPtrSet<Instruction *, 8> OpenIntervals;
6588   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6589   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6590 
6591   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6592 
6593   // A lambda that gets the register usage for the given type and VF.
6594   const auto &TTICapture = TTI;
6595   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6596     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6597       return 0;
6598     return *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6599   };
6600 
6601   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6602     Instruction *I = IdxToInstr[i];
6603 
6604     // Remove all of the instructions that end at this location.
6605     InstrList &List = TransposeEnds[i];
6606     for (Instruction *ToRemove : List)
6607       OpenIntervals.erase(ToRemove);
6608 
6609     // Ignore instructions that are never used within the loop.
6610     if (!Ends.count(I))
6611       continue;
6612 
6613     // Skip ignored values.
6614     if (ValuesToIgnore.count(I))
6615       continue;
6616 
6617     // For each VF find the maximum usage of registers.
6618     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6619       // Count the number of live intervals.
6620       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6621 
6622       if (VFs[j].isScalar()) {
6623         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
6625           if (RegUsage.find(ClassID) == RegUsage.end())
6626             RegUsage[ClassID] = 1;
6627           else
6628             RegUsage[ClassID] += 1;
6629         }
6630       } else {
6631         collectUniformsAndScalars(VFs[j]);
6632         for (auto Inst : OpenIntervals) {
6633           // Skip ignored values for VF > 1.
6634           if (VecValuesToIgnore.count(Inst))
6635             continue;
6636           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
6638             if (RegUsage.find(ClassID) == RegUsage.end())
6639               RegUsage[ClassID] = 1;
6640             else
6641               RegUsage[ClassID] += 1;
6642           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
6644             if (RegUsage.find(ClassID) == RegUsage.end())
6645               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6646             else
6647               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6648           }
6649         }
6650       }
6651 
6652       for (auto& pair : RegUsage) {
6653         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6655         else
6656           MaxUsages[j][pair.first] = pair.second;
6657       }
6658     }
6659 
6660     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6661                       << OpenIntervals.size() << '\n');
6662 
6663     // Add the current instruction to the list of open intervals.
6664     OpenIntervals.insert(I);
6665   }
6666 
6667   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6668     SmallMapVector<unsigned, unsigned, 4> Invariant;
6669 
6670     for (auto Inst : LoopInvariants) {
6671       unsigned Usage =
6672           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6673       unsigned ClassID =
6674           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6675       if (Invariant.find(ClassID) == Invariant.end())
6676         Invariant[ClassID] = Usage;
6677       else
6678         Invariant[ClassID] += Usage;
6679     }
6680 
6681     LLVM_DEBUG({
6682       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6683       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6684              << " item\n";
6685       for (const auto &pair : MaxUsages[i]) {
6686         dbgs() << "LV(REG): RegisterClass: "
6687                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6688                << " registers\n";
6689       }
6690       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6691              << " item\n";
6692       for (const auto &pair : Invariant) {
6693         dbgs() << "LV(REG): RegisterClass: "
6694                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6695                << " registers\n";
6696       }
6697     });
6698 
6699     RU.LoopInvariantRegs = Invariant;
6700     RU.MaxLocalUsers = MaxUsages[i];
6701     RUs[i] = RU;
6702   }
6703 
6704   return RUs;
6705 }
6706 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6708   // TODO: Cost model for emulated masked load/store is completely
6709   // broken. This hack guides the cost model to use an artificially
6710   // high enough value to practically disable vectorization with such
6711   // operations, except where previously deployed legality hack allowed
6712   // using very low cost values. This is to avoid regressions coming simply
6713   // from moving "masked load/store" check from legality to cost model.
  // Masked load/gather emulation was previously never allowed.
  // Only a limited amount of masked store/scatter emulation was allowed.
6716   assert(isPredicatedInst(I) &&
6717          "Expecting a scalar emulated instruction");
6718   return isa<LoadInst>(I) ||
6719          (isa<StoreInst>(I) &&
6720           NumPredStores > NumberOfStoresToPredicate);
6721 }
6722 
6723 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6724   // If we aren't vectorizing the loop, or if we've already collected the
6725   // instructions to scalarize, there's nothing to do. Collection may already
6726   // have occurred if we have a user-selected VF and are now computing the
6727   // expected cost for interleaving.
6728   if (VF.isScalar() || VF.isZero() ||
6729       InstsToScalarize.find(VF) != InstsToScalarize.end())
6730     return;
6731 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6733   // not profitable to scalarize any instructions, the presence of VF in the
6734   // map will indicate that we've analyzed it already.
6735   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6736 
6737   // Find all the instructions that are scalar with predication in the loop and
6738   // determine if it would be better to not if-convert the blocks they are in.
6739   // If so, we also record the instructions to scalarize.
6740   for (BasicBlock *BB : TheLoop->blocks()) {
6741     if (!blockNeedsPredication(BB))
6742       continue;
6743     for (Instruction &I : *BB)
6744       if (isScalarWithPredication(&I)) {
6745         ScalarCostsTy ScalarCosts;
6746         // Do not apply discount logic if hacked cost is needed
6747         // for emulated masked memrefs.
6748         if (!useEmulatedMaskMemRefHack(&I) &&
6749             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6750           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6751         // Remember that BB will remain after vectorization.
6752         PredicatedBBsAfterVectorization.insert(BB);
6753       }
6754   }
6755 }
6756 
6757 int LoopVectorizationCostModel::computePredInstDiscount(
6758     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6759   assert(!isUniformAfterVectorization(PredInst, VF) &&
6760          "Instruction marked uniform-after-vectorization will be predicated");
6761 
6762   // Initialize the discount to zero, meaning that the scalar version and the
6763   // vector version cost the same.
6764   InstructionCost Discount = 0;
6765 
6766   // Holds instructions to analyze. The instructions we visit are mapped in
6767   // ScalarCosts. Those instructions are the ones that would be scalarized if
6768   // we find that the scalar version costs less.
6769   SmallVector<Instruction *, 8> Worklist;
6770 
6771   // Returns true if the given instruction can be scalarized.
6772   auto canBeScalarized = [&](Instruction *I) -> bool {
6773     // We only attempt to scalarize instructions forming a single-use chain
6774     // from the original predicated block that would otherwise be vectorized.
6775     // Although not strictly necessary, we give up on instructions we know will
6776     // already be scalar to avoid traversing chains that are unlikely to be
6777     // beneficial.
6778     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6779         isScalarAfterVectorization(I, VF))
6780       return false;
6781 
6782     // If the instruction is scalar with predication, it will be analyzed
6783     // separately. We ignore it within the context of PredInst.
6784     if (isScalarWithPredication(I))
6785       return false;
6786 
6787     // If any of the instruction's operands are uniform after vectorization,
6788     // the instruction cannot be scalarized. This prevents, for example, a
6789     // masked load from being scalarized.
6790     //
6791     // We assume we will only emit a value for lane zero of an instruction
6792     // marked uniform after vectorization, rather than VF identical values.
6793     // Thus, if we scalarize an instruction that uses a uniform, we would
6794     // create uses of values corresponding to the lanes we aren't emitting code
6795     // for. This behavior can be changed by allowing getScalarValue to clone
6796     // the lane zero values for uniforms rather than asserting.
6797     for (Use &U : I->operands())
6798       if (auto *J = dyn_cast<Instruction>(U.get()))
6799         if (isUniformAfterVectorization(J, VF))
6800           return false;
6801 
6802     // Otherwise, we can scalarize the instruction.
6803     return true;
6804   };
6805 
6806   // Compute the expected cost discount from scalarizing the entire expression
6807   // feeding the predicated instruction. We currently only consider expressions
6808   // that are single-use instruction chains.
6809   Worklist.push_back(PredInst);
6810   while (!Worklist.empty()) {
6811     Instruction *I = Worklist.pop_back_val();
6812 
6813     // If we've already analyzed the instruction, there's nothing to do.
6814     if (ScalarCosts.find(I) != ScalarCosts.end())
6815       continue;
6816 
6817     // Compute the cost of the vector instruction. Note that this cost already
6818     // includes the scalarization overhead of the predicated instruction.
6819     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6820 
6821     // Compute the cost of the scalarized instruction. This cost is the cost of
6822     // the instruction as if it wasn't if-converted and instead remained in the
6823     // predicated block. We will scale this cost by block probability after
6824     // computing the scalarization overhead.
6825     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6826     InstructionCost ScalarCost =
6827         VF.getKnownMinValue() *
6828         getInstructionCost(I, ElementCount::getFixed(1)).first;
6829 
6830     // Compute the scalarization overhead of needed insertelement instructions
6831     // and phi nodes.
6832     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6833       ScalarCost += TTI.getScalarizationOverhead(
6834           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6835           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6836       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6837       ScalarCost +=
6838           VF.getKnownMinValue() *
6839           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6840     }
6841 
6842     // Compute the scalarization overhead of needed extractelement
6843     // instructions. For each of the instruction's operands, if the operand can
6844     // be scalarized, add it to the worklist; otherwise, account for the
6845     // overhead.
6846     for (Use &U : I->operands())
6847       if (auto *J = dyn_cast<Instruction>(U.get())) {
6848         assert(VectorType::isValidElementType(J->getType()) &&
6849                "Instruction has non-scalar type");
6850         if (canBeScalarized(J))
6851           Worklist.push_back(J);
6852         else if (needsExtract(J, VF)) {
6853           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6854           ScalarCost += TTI.getScalarizationOverhead(
6855               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6856               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6857         }
6858       }
6859 
6860     // Scale the total scalar cost by block probability.
6861     ScalarCost /= getReciprocalPredBlockProb();
6862 
6863     // Compute the discount. A non-negative discount means the vector version
6864     // of the instruction costs more, and scalarizing would be beneficial.
6865     Discount += VectorCost - ScalarCost;
6866     ScalarCosts[I] = ScalarCost;
6867   }
6868 
6869   return *Discount.getValue();
6870 }
6871 
6872 LoopVectorizationCostModel::VectorizationCostTy
6873 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6874   VectorizationCostTy Cost;
6875 
6876   // For each block.
6877   for (BasicBlock *BB : TheLoop->blocks()) {
6878     VectorizationCostTy BlockCost;
6879 
6880     // For each instruction in the old loop.
6881     for (Instruction &I : BB->instructionsWithoutDebug()) {
6882       // Skip ignored values.
6883       if (ValuesToIgnore.count(&I) ||
6884           (VF.isVector() && VecValuesToIgnore.count(&I)))
6885         continue;
6886 
6887       VectorizationCostTy C = getInstructionCost(&I, VF);
6888 
6889       // Check if we should override the cost.
6890       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6891         C.first = InstructionCost(ForceTargetInstructionCost);
6892 
6893       BlockCost.first += C.first;
6894       BlockCost.second |= C.second;
6895       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6896                         << " for VF " << VF << " For instruction: " << I
6897                         << '\n');
6898     }
6899 
6900     // If we are vectorizing a predicated block, it will have been
6901     // if-converted. This means that the block's instructions (aside from
6902     // stores and instructions that may divide by zero) will now be
6903     // unconditionally executed. For the scalar case, we may not always execute
6904     // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so that blocks in tail-folded loops are not all treated
    // as predicated.
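    // For example, if the reciprocal block probability is assumed to be 2,
    // i.e. the block runs on roughly half the iterations, a block cost of 8
    // is scaled down to 4 in the scalar cost.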
6907     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6908       BlockCost.first /= getReciprocalPredBlockProb();
6909 
6910     Cost.first += BlockCost.first;
6911     Cost.second |= BlockCost.second;
6912   }
6913 
6914   return Cost;
6915 }
6916 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6919 ///
6920 /// This SCEV can be sent to the Target in order to estimate the address
6921 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6928   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6929   if (!Gep)
6930     return nullptr;
6931 
6932   // We are looking for a gep with all loop invariant indices except for one
6933   // which should be an induction variable.
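  // For instance, a hypothetical access such as
  //   %gep = getelementptr [100 x i32], [100 x i32]* %A, i64 %inv, i64 %ind
  // qualifies when %inv is loop invariant and %ind is an induction variable.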
6934   auto SE = PSE.getSE();
6935   unsigned NumOperands = Gep->getNumOperands();
6936   for (unsigned i = 1; i < NumOperands; ++i) {
6937     Value *Opd = Gep->getOperand(i);
6938     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6939         !Legal->isInductionVariable(Opd))
6940       return nullptr;
6941   }
6942 
  // Now we know we have a GEP (ptr, %inv, %ind, %inv); return the Ptr SCEV.
6944   return PSE.getSCEV(Ptr);
6945 }
6946 
6947 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6948   return Legal->hasStride(I->getOperand(0)) ||
6949          Legal->hasStride(I->getOperand(1));
6950 }
6951 
6952 InstructionCost
6953 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6954                                                         ElementCount VF) {
6955   assert(VF.isVector() &&
6956          "Scalarization cost of instruction implies vectorization.");
6957   if (VF.isScalable())
6958     return InstructionCost::getInvalid();
6959 
6960   Type *ValTy = getLoadStoreType(I);
6961   auto SE = PSE.getSE();
6962 
6963   unsigned AS = getLoadStoreAddressSpace(I);
6964   Value *Ptr = getLoadStorePointerOperand(I);
6965   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6966 
  // Figure out whether the access is strided and get the stride value if it
  // is known at compile time.
6969   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6970 
6971   // Get the cost of the scalar memory instruction and address computation.
6972   InstructionCost Cost =
6973       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6974 
6975   // Don't pass *I here, since it is scalar but will actually be part of a
6976   // vectorized loop where the user of it is a vectorized instruction.
6977   const Align Alignment = getLoadStoreAlignment(I);
6978   Cost += VF.getKnownMinValue() *
6979           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6980                               AS, TTI::TCK_RecipThroughput);
6981 
6982   // Get the overhead of the extractelement and insertelement instructions
6983   // we might create due to scalarization.
6984   Cost += getScalarizationOverhead(I, VF);
6985 
6986   // If we have a predicated load/store, it will need extra i1 extracts and
6987   // conditional branches, but may not be executed for each vector lane. Scale
6988   // the cost by the probability of executing the predicated block.
6989   if (isPredicatedInst(I)) {
6990     Cost /= getReciprocalPredBlockProb();
6991 
6992     // Add the cost of an i1 extract and a branch
6993     auto *Vec_i1Ty =
6994         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6995     Cost += TTI.getScalarizationOverhead(
6996         Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6997         /*Insert=*/false, /*Extract=*/true);
6998     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6999 
7000     if (useEmulatedMaskMemRefHack(I))
7001       // Artificially setting to a high enough value to practically disable
7002       // vectorization with such operations.
7003       Cost = 3000000;
7004   }
7005 
7006   return Cost;
7007 }
7008 
7009 InstructionCost
7010 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7011                                                     ElementCount VF) {
7012   Type *ValTy = getLoadStoreType(I);
7013   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7014   Value *Ptr = getLoadStorePointerOperand(I);
7015   unsigned AS = getLoadStoreAddressSpace(I);
7016   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
7017   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7018 
7019   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7020          "Stride should be 1 or -1 for consecutive memory access");
7021   const Align Alignment = getLoadStoreAlignment(I);
7022   InstructionCost Cost = 0;
7023   if (Legal->isMaskRequired(I))
7024     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7025                                       CostKind);
7026   else
7027     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7028                                 CostKind, I);
7029 
7030   bool Reverse = ConsecutiveStride < 0;
7031   if (Reverse)
7032     Cost +=
7033         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7034   return Cost;
7035 }
7036 
7037 InstructionCost
7038 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7039                                                 ElementCount VF) {
7040   assert(Legal->isUniformMemOp(*I));
7041 
7042   Type *ValTy = getLoadStoreType(I);
7043   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7044   const Align Alignment = getLoadStoreAlignment(I);
7045   unsigned AS = getLoadStoreAddressSpace(I);
7046   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7047   if (isa<LoadInst>(I)) {
7048     return TTI.getAddressComputationCost(ValTy) +
7049            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
7050                                CostKind) +
7051            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7052   }
7053   StoreInst *SI = cast<StoreInst>(I);
7054 
7055   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
7056   return TTI.getAddressComputationCost(ValTy) +
7057          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
7058                              CostKind) +
7059          (isLoopInvariantStoreValue
7060               ? 0
7061               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7062                                        VF.getKnownMinValue() - 1));
7063 }
7064 
7065 InstructionCost
7066 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7067                                                  ElementCount VF) {
7068   Type *ValTy = getLoadStoreType(I);
7069   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7070   const Align Alignment = getLoadStoreAlignment(I);
7071   const Value *Ptr = getLoadStorePointerOperand(I);
7072 
7073   return TTI.getAddressComputationCost(VectorTy) +
7074          TTI.getGatherScatterOpCost(
7075              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7076              TargetTransformInfo::TCK_RecipThroughput, I);
7077 }
7078 
7079 InstructionCost
7080 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7081                                                    ElementCount VF) {
7082   // TODO: Once we have support for interleaving with scalable vectors
7083   // we can calculate the cost properly here.
7084   if (VF.isScalable())
7085     return InstructionCost::getInvalid();
7086 
7087   Type *ValTy = getLoadStoreType(I);
7088   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7089   unsigned AS = getLoadStoreAddressSpace(I);
7090 
7091   auto Group = getInterleavedAccessGroup(I);
7092   assert(Group && "Fail to get an interleaved access group.");
7093 
7094   unsigned InterleaveFactor = Group->getFactor();
7095   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
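  // For a hypothetical factor-2 group of i32 accesses at VF = 4, WideVecTy is
  // <8 x i32>, i.e. the vector covering all members of the group.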
7096 
7097   // Holds the indices of existing members in an interleaved load group.
7098   // An interleaved store group doesn't need this as it doesn't allow gaps.
7099   SmallVector<unsigned, 4> Indices;
7100   if (isa<LoadInst>(I)) {
7101     for (unsigned i = 0; i < InterleaveFactor; i++)
7102       if (Group->getMember(i))
7103         Indices.push_back(i);
7104   }
7105 
7106   // Calculate the cost of the whole interleaved group.
7107   bool UseMaskForGaps =
7108       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
7109   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
7110       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
7111       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
7112 
7113   if (Group->isReverse()) {
7114     // TODO: Add support for reversed masked interleaved access.
7115     assert(!Legal->isMaskRequired(I) &&
7116            "Reverse masked interleaved access not supported.");
7117     Cost +=
7118         Group->getNumMembers() *
7119         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7120   }
7121   return Cost;
7122 }
7123 
7124 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
7125     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit if there are no in-loop reductions.
7127   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
7128     return InstructionCost::getInvalid();
7129   auto *VectorTy = cast<VectorType>(Ty);
7130 
  // We are looking for one of the following patterns, finding the minimal
  // acceptable cost among them:
7132   //  reduce(mul(ext(A), ext(B))) or
7133   //  reduce(mul(A, B)) or
7134   //  reduce(ext(A)) or
7135   //  reduce(A).
7136   // The basic idea is that we walk down the tree to do that, finding the root
7137   // reduction instruction in InLoopReductionImmediateChains. From there we find
7138   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return an invalid cost specifying that the
  // original cost method should be used.
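  // After vectorization at VF = 8, a reduce(mul(ext(A), ext(B))) pattern
  // might, hypothetically, be emitted as:
  //   %ea  = sext <8 x i8> %a to <8 x i32>
  //   %eb  = sext <8 x i8> %b to <8 x i32>
  //   %m   = mul <8 x i32> %ea, %eb
  //   %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  //   %sum = add i32 %phi, %red
  // If the target reports an extended multiply-add reduction as cheaper than
  // the sum of the component costs, the whole pattern is costed that way.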
7143   Instruction *RetI = I;
7144   if ((RetI->getOpcode() == Instruction::SExt ||
7145        RetI->getOpcode() == Instruction::ZExt)) {
7146     if (!RetI->hasOneUser())
7147       return InstructionCost::getInvalid();
7148     RetI = RetI->user_back();
7149   }
7150   if (RetI->getOpcode() == Instruction::Mul &&
7151       RetI->user_back()->getOpcode() == Instruction::Add) {
7152     if (!RetI->hasOneUser())
7153       return InstructionCost::getInvalid();
7154     RetI = RetI->user_back();
7155   }
7156 
  // Test if the found instruction is a reduction, and if not, return an
  // invalid cost specifying that the parent should use the original cost
  // modelling.
7159   if (!InLoopReductionImmediateChains.count(RetI))
7160     return InstructionCost::getInvalid();
7161 
7162   // Find the reduction this chain is a part of and calculate the basic cost of
7163   // the reduction on its own.
7164   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
7165   Instruction *ReductionPhi = LastChain;
7166   while (!isa<PHINode>(ReductionPhi))
7167     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
7168 
7169   const RecurrenceDescriptor &RdxDesc =
7170       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
7171   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
7172       RdxDesc.getOpcode(), VectorTy, false, CostKind);
7173 
7174   // Get the operand that was not the reduction chain and match it to one of the
7175   // patterns, returning the better cost if it is found.
7176   Instruction *RedOp = RetI->getOperand(1) == LastChain
7177                            ? dyn_cast<Instruction>(RetI->getOperand(0))
7178                            : dyn_cast<Instruction>(RetI->getOperand(1));
7179 
7180   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
7181 
7182   if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
7183       !TheLoop->isLoopInvariant(RedOp)) {
7184     bool IsUnsigned = isa<ZExtInst>(RedOp);
7185     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7186     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7187         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7188         CostKind);
7189 
7190     InstructionCost ExtCost =
7191         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7192                              TTI::CastContextHint::None, CostKind, RedOp);
7193     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7194       return I == RetI ? *RedCost.getValue() : 0;
7195   } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
7196     Instruction *Mul = RedOp;
7197     Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
7198     Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
7199     if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
7200         Op0->getOpcode() == Op1->getOpcode() &&
7201         Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7202         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7203       bool IsUnsigned = isa<ZExtInst>(Op0);
7204       auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7205       // reduce(mul(ext, ext))
7206       InstructionCost ExtCost =
7207           TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
7208                                TTI::CastContextHint::None, CostKind, Op0);
7209       InstructionCost MulCost =
7210           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
7211 
7212       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7213           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7214           CostKind);
7215 
7216       if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
7217         return I == RetI ? *RedCost.getValue() : 0;
7218     } else {
7219       InstructionCost MulCost =
7220           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
7221 
7222       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7223           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7224           CostKind);
7225 
7226       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7227         return I == RetI ? *RedCost.getValue() : 0;
7228     }
7229   }
7230 
7231   return I == RetI ? BaseCost : InstructionCost::getInvalid();
7232 }
7233 
7234 InstructionCost
7235 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7236                                                      ElementCount VF) {
7237   // Calculate scalar cost only. Vectorization cost should be ready at this
7238   // moment.
7239   if (VF.isScalar()) {
7240     Type *ValTy = getLoadStoreType(I);
7241     const Align Alignment = getLoadStoreAlignment(I);
7242     unsigned AS = getLoadStoreAddressSpace(I);
7243 
7244     return TTI.getAddressComputationCost(ValTy) +
7245            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7246                                TTI::TCK_RecipThroughput, I);
7247   }
7248   return getWideningCost(I, VF);
7249 }
7250 
7251 LoopVectorizationCostModel::VectorizationCostTy
7252 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7253                                                ElementCount VF) {
7254   // If we know that this instruction will remain uniform, check the cost of
7255   // the scalar version.
7256   if (isUniformAfterVectorization(I, VF))
7257     VF = ElementCount::getFixed(1);
7258 
7259   if (VF.isVector() && isProfitableToScalarize(I, VF))
7260     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7261 
7262   // Forced scalars do not have any scalarization overhead.
7263   auto ForcedScalar = ForcedScalars.find(VF);
7264   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7265     auto InstSet = ForcedScalar->second;
7266     if (InstSet.count(I))
7267       return VectorizationCostTy(
7268           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7269            VF.getKnownMinValue()),
7270           false);
7271   }
7272 
7273   Type *VectorTy;
7274   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7275 
7276   bool TypeNotScalarized =
7277       VF.isVector() && VectorTy->isVectorTy() &&
7278       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7279   return VectorizationCostTy(C, TypeNotScalarized);
7280 }
7281 
7282 InstructionCost
7283 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7284                                                      ElementCount VF) const {
7285 
7286   if (VF.isScalable())
7287     return InstructionCost::getInvalid();
7288 
7289   if (VF.isScalar())
7290     return 0;
7291 
7292   InstructionCost Cost = 0;
7293   Type *RetTy = ToVectorTy(I->getType(), VF);
7294   if (!RetTy->isVoidTy() &&
7295       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7296     Cost += TTI.getScalarizationOverhead(
7297         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
7298         true, false);
7299 
7300   // Some targets keep addresses scalar.
7301   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7302     return Cost;
7303 
7304   // Some targets support efficient element stores.
7305   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7306     return Cost;
7307 
7308   // Collect operands to consider.
7309   CallInst *CI = dyn_cast<CallInst>(I);
7310   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
7311 
7312   // Skip operands that do not require extraction/scalarization and do not incur
7313   // any overhead.
7314   SmallVector<Type *> Tys;
7315   for (auto *V : filterExtractingOperands(Ops, VF))
7316     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7317   return Cost + TTI.getOperandsScalarizationOverhead(
7318                     filterExtractingOperands(Ops, VF), Tys);
7319 }
7320 
7321 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7322   if (VF.isScalar())
7323     return;
7324   NumPredStores = 0;
7325   for (BasicBlock *BB : TheLoop->blocks()) {
7326     // For each instruction in the old loop.
7327     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
7329       if (!Ptr)
7330         continue;
7331 
7332       // TODO: We should generate better code and update the cost model for
7333       // predicated uniform stores. Today they are treated as any other
7334       // predicated store (see added test cases in
7335       // invariant-store-vectorization.ll).
7336       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7337         NumPredStores++;
7338 
7339       if (Legal->isUniformMemOp(I)) {
7340         // TODO: Avoid replicating loads and stores instead of
7341         // relying on instcombine to remove them.
7342         // Load: Scalar load + broadcast
7343         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7344         InstructionCost Cost;
7345         if (isa<StoreInst>(&I) && VF.isScalable() &&
7346             isLegalGatherOrScatter(&I)) {
7347           Cost = getGatherScatterCost(&I, VF);
7348           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7349         } else {
7350           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7351                  "Cannot yet scalarize uniform stores");
7352           Cost = getUniformMemOpCost(&I, VF);
7353           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7354         }
7355         continue;
7356       }
7357 
7358       // We assume that widening is the best solution when possible.
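      // E.g. (illustrative), with VF=4 four consecutive scalar i32 loads
      // become a single load of <4 x i32>, possibly followed by a reverse
      // shuffle when the consecutive stride is -1.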
7359       if (memoryInstructionCanBeWidened(&I, VF)) {
7360         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7361         int ConsecutiveStride =
7362                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7363         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7364                "Expected consecutive stride.");
7365         InstWidening Decision =
7366             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7367         setWideningDecision(&I, VF, Decision, Cost);
7368         continue;
7369       }
7370 
7371       // Choose between Interleaving, Gather/Scatter or Scalarization.
7372       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7373       unsigned NumAccesses = 1;
7374       if (isAccessInterleaved(&I)) {
7375         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
7377 
7378         // Make one decision for the whole group.
7379         if (getWideningDecision(&I, VF) != CM_Unknown)
7380           continue;
7381 
7382         NumAccesses = Group->getNumMembers();
7383         if (interleavedAccessCanBeWidened(&I, VF))
7384           InterleaveCost = getInterleaveGroupCost(&I, VF);
7385       }
7386 
7387       InstructionCost GatherScatterCost =
7388           isLegalGatherOrScatter(&I)
7389               ? getGatherScatterCost(&I, VF) * NumAccesses
7390               : InstructionCost::getInvalid();
7391 
7392       InstructionCost ScalarizationCost =
7393           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7394 
      // Choose the best solution for the current VF, record this decision,
      // and use it during vectorization.
7397       InstructionCost Cost;
7398       InstWidening Decision;
7399       if (InterleaveCost <= GatherScatterCost &&
7400           InterleaveCost < ScalarizationCost) {
7401         Decision = CM_Interleave;
7402         Cost = InterleaveCost;
7403       } else if (GatherScatterCost < ScalarizationCost) {
7404         Decision = CM_GatherScatter;
7405         Cost = GatherScatterCost;
7406       } else {
7407         assert(!VF.isScalable() &&
7408                "We cannot yet scalarise for scalable vectors");
7409         Decision = CM_Scalarize;
7410         Cost = ScalarizationCost;
7411       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
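      // E.g. (illustrative), for a two-member group of loads from A[2*i] and
      // A[2*i+1], both members get the same decision, and the group's total
      // cost is attached to just one of them.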
7415       if (auto Group = getInterleavedAccessGroup(&I))
7416         setWideningDecision(Group, VF, Decision, Cost);
7417       else
7418         setWideningDecision(&I, VF, Decision, Cost);
7419     }
7420   }
7421 
7422   // Make sure that any load of address and any other address computation
7423   // remains scalar unless there is gather/scatter support. This avoids
7424   // inevitable extracts into address registers, and also has the benefit of
7425   // activating LSR more, since that pass can't optimize vectorized
7426   // addresses.
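  // For instance (illustrative, typed-pointer IR):
  //   %p = load i32*, i32** %pp
  //   %v = load i32, i32* %p
  // Here %p only feeds an address computation, so it is kept scalar below.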
7427   if (TTI.prefersVectorizedAddressing())
7428     return;
7429 
7430   // Start with all scalar pointer uses.
7431   SmallPtrSet<Instruction *, 8> AddrDefs;
7432   for (BasicBlock *BB : TheLoop->blocks())
7433     for (Instruction &I : *BB) {
7434       Instruction *PtrDef =
7435         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7436       if (PtrDef && TheLoop->contains(PtrDef) &&
7437           getWideningDecision(&I, VF) != CM_GatherScatter)
7438         AddrDefs.insert(PtrDef);
7439     }
7440 
7441   // Add all instructions used to generate the addresses.
7442   SmallVector<Instruction *, 4> Worklist;
7443   append_range(Worklist, AddrDefs);
7444   while (!Worklist.empty()) {
7445     Instruction *I = Worklist.pop_back_val();
7446     for (auto &Op : I->operands())
7447       if (auto *InstOp = dyn_cast<Instruction>(Op))
7448         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7449             AddrDefs.insert(InstOp).second)
7450           Worklist.push_back(InstOp);
7451   }
7452 
7453   for (auto *I : AddrDefs) {
7454     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here once we know this is the case.
7459       InstWidening Decision = getWideningDecision(I, VF);
7460       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7461         // Scalarize a widened load of address.
7462         setWideningDecision(
7463             I, VF, CM_Scalarize,
7464             (VF.getKnownMinValue() *
7465              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7466       else if (auto Group = getInterleavedAccessGroup(I)) {
7467         // Scalarize an interleave group of address loads.
7468         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7469           if (Instruction *Member = Group->getMember(I))
7470             setWideningDecision(
7471                 Member, VF, CM_Scalarize,
7472                 (VF.getKnownMinValue() *
7473                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7474         }
7475       }
7476     } else
7477       // Make sure I gets scalarized and a cost estimate without
7478       // scalarization overhead.
7479       ForcedScalars[VF].insert(I);
7480   }
7481 }
7482 
7483 InstructionCost
7484 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7485                                                Type *&VectorTy) {
7486   Type *RetTy = I->getType();
7487   if (canTruncateToMinimalBitwidth(I, VF))
7488     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7489   auto SE = PSE.getSE();
7490   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7491 
7492   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7493                                                 ElementCount VF) -> bool {
7494     if (VF.isScalar())
7495       return true;
7496 
7497     auto Scalarized = InstsToScalarize.find(VF);
7498     assert(Scalarized != InstsToScalarize.end() &&
7499            "VF not yet analyzed for scalarization profitability");
7500     return !Scalarized->second.count(I) &&
7501            llvm::all_of(I->users(), [&](User *U) {
7502              auto *UI = cast<Instruction>(U);
7503              return !Scalarized->second.count(UI);
7504            });
7505   };
7506   (void) hasSingleCopyAfterVectorization;
7507 
7508   if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result, we
    // don't have to multiply the instruction cost by VF.
7514     assert(I->getOpcode() == Instruction::GetElementPtr ||
7515            I->getOpcode() == Instruction::PHI ||
7516            (I->getOpcode() == Instruction::BitCast &&
7517             I->getType()->isPointerTy()) ||
7518            hasSingleCopyAfterVectorization(I, VF));
7519     VectorTy = RetTy;
7520   } else
7521     VectorTy = ToVectorTy(RetTy, VF);
7522 
7523   // TODO: We need to estimate the cost of intrinsic calls.
7524   switch (I->getOpcode()) {
7525   case Instruction::GetElementPtr:
7526     // We mark this instruction as zero-cost because the cost of GEPs in
7527     // vectorized code depends on whether the corresponding memory instruction
7528     // is scalarized or not. Therefore, we handle GEPs with the memory
7529     // instruction cost.
7530     return 0;
7531   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
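    // E.g. (illustrative), with VF=4 there are four such predicated blocks,
    // and each guarding branch extracts one lane of the <4 x i1> compare.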
7535     bool ScalarPredicatedBB = false;
7536     BranchInst *BI = cast<BranchInst>(I);
7537     if (VF.isVector() && BI->isConditional() &&
7538         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7539          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7540       ScalarPredicatedBB = true;
7541 
7542     if (ScalarPredicatedBB) {
7543       // Return cost for branches around scalarized and predicated blocks.
7544       assert(!VF.isScalable() && "scalable vectors not yet supported.");
7545       auto *Vec_i1Ty =
7546           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7547       return (TTI.getScalarizationOverhead(
7548                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7549                   false, true) +
7550               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7551                VF.getKnownMinValue()));
7552     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7553       // The back-edge branch will remain, as will all scalar branches.
7554       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7555     else
7556       // This branch will be eliminated by if-conversion.
7557       return 0;
7558     // Note: We currently assume zero cost for an unconditional branch inside
7559     // a predicated block since it will become a fall-through, although we
7560     // may decide in the future to call TTI for all branches.
7561   }
7562   case Instruction::PHI: {
7563     auto *Phi = cast<PHINode>(I);
7564 
7565     // First-order recurrences are replaced by vector shuffles inside the loop.
7566     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7567     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7568       return TTI.getShuffleCost(
7569           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7570           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7571 
7572     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7573     // converted into select instructions. We require N - 1 selects per phi
7574     // node, where N is the number of incoming values.
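    // For example (illustrative), a phi with three incoming values becomes a
    // chain of two vector selects:
    //   %s1 = select <VF x i1> %m1, <VF x i32> %v1, <VF x i32> %v2
    //   %s2 = select <VF x i1> %m2, <VF x i32> %s1, <VF x i32> %v3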
7575     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7576       return (Phi->getNumIncomingValues() - 1) *
7577              TTI.getCmpSelInstrCost(
7578                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7579                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7580                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7581 
7582     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7583   }
7584   case Instruction::UDiv:
7585   case Instruction::SDiv:
7586   case Instruction::URem:
7587   case Instruction::SRem:
7588     // If we have a predicated instruction, it may not be executed for each
7589     // vector lane. Get the scalarization cost and scale this amount by the
7590     // probability of executing the predicated block. If the instruction is not
7591     // predicated, we fall through to the next case.
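    // E.g. (illustrative), with VF=4 the scalarized cost covers four copies of
    // the division plus the insert/extract overhead, and the total is then
    // divided by the assumed reciprocal block probability.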
7592     if (VF.isVector() && isScalarWithPredication(I)) {
7593       InstructionCost Cost = 0;
7594 
7595       // These instructions have a non-void type, so account for the phi nodes
7596       // that we will create. This cost is likely to be zero. The phi node
7597       // cost, if any, should be scaled by the block probability because it
7598       // models a copy at the end of each predicated block.
7599       Cost += VF.getKnownMinValue() *
7600               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7601 
7602       // The cost of the non-predicated instruction.
7603       Cost += VF.getKnownMinValue() *
7604               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7605 
7606       // The cost of insertelement and extractelement instructions needed for
7607       // scalarization.
7608       Cost += getScalarizationOverhead(I, VF);
7609 
7610       // Scale the cost by the probability of executing the predicated blocks.
7611       // This assumes the predicated block for each vector lane is equally
7612       // likely.
7613       return Cost / getReciprocalPredBlockProb();
7614     }
7615     LLVM_FALLTHROUGH;
7616   case Instruction::Add:
7617   case Instruction::FAdd:
7618   case Instruction::Sub:
7619   case Instruction::FSub:
7620   case Instruction::Mul:
7621   case Instruction::FMul:
7622   case Instruction::FDiv:
7623   case Instruction::FRem:
7624   case Instruction::Shl:
7625   case Instruction::LShr:
7626   case Instruction::AShr:
7627   case Instruction::And:
7628   case Instruction::Or:
7629   case Instruction::Xor: {
7630     // Since we will replace the stride by 1 the multiplication should go away.
7631     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7632       return 0;
7633 
7634     // Detect reduction patterns
7635     InstructionCost RedCost;
7636     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7637             .isValid())
7638       return RedCost;
7639 
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
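    // E.g. (illustrative), 'shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>'
    // may map to a cheap immediate shift, whereas a variable per-element
    // shift amount can be considerably more expensive on some subtargets.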
7642     Value *Op2 = I->getOperand(1);
7643     TargetTransformInfo::OperandValueProperties Op2VP;
7644     TargetTransformInfo::OperandValueKind Op2VK =
7645         TTI.getOperandInfo(Op2, Op2VP);
7646     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7647       Op2VK = TargetTransformInfo::OK_UniformValue;
7648 
7649     SmallVector<const Value *, 4> Operands(I->operand_values());
7650     return TTI.getArithmeticInstrCost(
7651         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7652         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7653   }
7654   case Instruction::FNeg: {
7655     return TTI.getArithmeticInstrCost(
7656         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7657         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7658         TargetTransformInfo::OP_None, I->getOperand(0), I);
7659   }
7660   case Instruction::Select: {
7661     SelectInst *SI = cast<SelectInst>(I);
7662     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7663     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7664 
7665     const Value *Op0, *Op1;
7666     using namespace llvm::PatternMatch;
7667     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7668                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7669       // select x, y, false --> x & y
7670       // select x, true, y --> x | y
7671       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7672       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7673       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7674       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7675       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7676               Op1->getType()->getScalarSizeInBits() == 1);
7677 
7678       SmallVector<const Value *, 2> Operands{Op0, Op1};
7679       return TTI.getArithmeticInstrCost(
7680           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7681           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7682     }
7683 
7684     Type *CondTy = SI->getCondition()->getType();
7685     if (!ScalarCond)
7686       CondTy = VectorType::get(CondTy, VF);
7687     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7688                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7689   }
7690   case Instruction::ICmp:
7691   case Instruction::FCmp: {
7692     Type *ValTy = I->getOperand(0)->getType();
7693     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7694     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7695       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7696     VectorTy = ToVectorTy(ValTy, VF);
7697     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7698                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7699   }
7700   case Instruction::Store:
7701   case Instruction::Load: {
7702     ElementCount Width = VF;
7703     if (Width.isVector()) {
7704       InstWidening Decision = getWideningDecision(I, Width);
7705       assert(Decision != CM_Unknown &&
7706              "CM decision should be taken at this point");
7707       if (Decision == CM_Scalarize)
7708         Width = ElementCount::getFixed(1);
7709     }
7710     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7711     return getMemoryInstructionCost(I, VF);
7712   }
7713   case Instruction::BitCast:
7714     if (I->getType()->isPointerTy())
7715       return 0;
7716     LLVM_FALLTHROUGH;
7717   case Instruction::ZExt:
7718   case Instruction::SExt:
7719   case Instruction::FPToUI:
7720   case Instruction::FPToSI:
7721   case Instruction::FPExt:
7722   case Instruction::PtrToInt:
7723   case Instruction::IntToPtr:
7724   case Instruction::SIToFP:
7725   case Instruction::UIToFP:
7726   case Instruction::Trunc:
7727   case Instruction::FPTrunc: {
7728     // Computes the CastContextHint from a Load/Store instruction.
7729     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7730       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7731              "Expected a load or a store!");
7732 
7733       if (VF.isScalar() || !TheLoop->contains(I))
7734         return TTI::CastContextHint::Normal;
7735 
7736       switch (getWideningDecision(I, VF)) {
7737       case LoopVectorizationCostModel::CM_GatherScatter:
7738         return TTI::CastContextHint::GatherScatter;
7739       case LoopVectorizationCostModel::CM_Interleave:
7740         return TTI::CastContextHint::Interleave;
7741       case LoopVectorizationCostModel::CM_Scalarize:
7742       case LoopVectorizationCostModel::CM_Widen:
7743         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7744                                         : TTI::CastContextHint::Normal;
7745       case LoopVectorizationCostModel::CM_Widen_Reverse:
7746         return TTI::CastContextHint::Reversed;
7747       case LoopVectorizationCostModel::CM_Unknown:
7748         llvm_unreachable("Instr did not go through cost modelling?");
7749       }
7750 
7751       llvm_unreachable("Unhandled case!");
7752     };
7753 
7754     unsigned Opcode = I->getOpcode();
7755     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7756     // For Trunc, the context is the only user, which must be a StoreInst.
7757     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7758       if (I->hasOneUse())
7759         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7760           CCH = ComputeCCH(Store);
7761     }
7762     // For Z/Sext, the context is the operand, which must be a LoadInst.
7763     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7764              Opcode == Instruction::FPExt) {
7765       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7766         CCH = ComputeCCH(Load);
7767     }
7768 
7769     // We optimize the truncation of induction variables having constant
7770     // integer steps. The cost of these truncations is the same as the scalar
7771     // operation.
7772     if (isOptimizableIVTruncate(I, VF)) {
7773       auto *Trunc = cast<TruncInst>(I);
7774       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7775                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7776     }
7777 
7778     // Detect reduction patterns
7779     InstructionCost RedCost;
7780     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7781             .isValid())
7782       return RedCost;
7783 
7784     Type *SrcScalarTy = I->getOperand(0)->getType();
7785     Type *SrcVecTy =
7786         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7787     if (canTruncateToMinimalBitwidth(I, VF)) {
7788       // This cast is going to be shrunk. This may remove the cast or it might
7789       // turn it into slightly different cast. For example, if MinBW == 16,
7790       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7791       //
7792       // Calculate the modified src and dest types.
7793       Type *MinVecTy = VectorTy;
7794       if (Opcode == Instruction::Trunc) {
7795         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7796         VectorTy =
7797             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7798       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7799         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7800         VectorTy =
7801             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7802       }
7803     }
7804 
7805     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7806   }
7807   case Instruction::Call: {
7808     bool NeedToScalarize;
7809     CallInst *CI = cast<CallInst>(I);
7810     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7811     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7812       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7813       return std::min(CallCost, IntrinsicCost);
7814     }
7815     return CallCost;
7816   }
7817   case Instruction::ExtractValue:
7818     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7819   default:
7820     // This opcode is unknown. Assume that it is the same as 'mul'.
7821     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7822   } // end of switch.
7823 }
7824 
7825 char LoopVectorize::ID = 0;
7826 
7827 static const char lv_name[] = "Loop Vectorization";
7828 
7829 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7830 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7831 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7832 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7833 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7834 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7835 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7836 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7837 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7838 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7839 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7840 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7841 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7842 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7843 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7844 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7845 
7846 namespace llvm {
7847 
7848 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7849 
7850 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7851                               bool VectorizeOnlyWhenForced) {
7852   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7853 }
7854 
7855 } // end namespace llvm
7856 
7857 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7858   // Check if the pointer operand of a load or store instruction is
7859   // consecutive.
7860   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7861     return Legal->isConsecutivePtr(Ptr);
7862   return false;
7863 }
7864 
7865 void LoopVectorizationCostModel::collectValuesToIgnore() {
7866   // Ignore ephemeral values.
7867   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7868 
7869   // Ignore type-promoting instructions we identified during reduction
7870   // detection.
7871   for (auto &Reduction : Legal->getReductionVars()) {
7872     RecurrenceDescriptor &RedDes = Reduction.second;
7873     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7874     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7875   }
7876   // Ignore type-casting instructions we identified during induction
7877   // detection.
7878   for (auto &Induction : Legal->getInductionVars()) {
7879     InductionDescriptor &IndDes = Induction.second;
7880     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7881     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7882   }
7883 }
7884 
7885 void LoopVectorizationCostModel::collectInLoopReductions() {
7886   for (auto &Reduction : Legal->getReductionVars()) {
7887     PHINode *Phi = Reduction.first;
7888     RecurrenceDescriptor &RdxDesc = Reduction.second;
7889 
7890     // We don't collect reductions that are type promoted (yet).
7891     if (RdxDesc.getRecurrenceType() != Phi->getType())
7892       continue;
7893 
7894     // If the target would prefer this reduction to happen "in-loop", then we
7895     // want to record it as such.
7896     unsigned Opcode = RdxDesc.getOpcode();
7897     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7898         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7899                                    TargetTransformInfo::ReductionFlags()))
7900       continue;
7901 
7902     // Check that we can correctly put the reductions into the loop, by
7903     // finding the chain of operations that leads from the phi to the loop
7904     // exit value.
7905     SmallVector<Instruction *, 4> ReductionOperations =
7906         RdxDesc.getReductionOpChain(Phi, TheLoop);
7907     bool InLoop = !ReductionOperations.empty();
7908     if (InLoop) {
7909       InLoopReductionChains[Phi] = ReductionOperations;
7910       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7911       Instruction *LastChain = Phi;
7912       for (auto *I : ReductionOperations) {
7913         InLoopReductionImmediateChains[I] = LastChain;
7914         LastChain = I;
7915       }
7916     }
7917     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7918                       << " reduction for phi: " << *Phi << "\n");
7919   }
7920 }
7921 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do this yet because VPlan currently
// lacks a cost model that can choose which plan to execute when more
// than one is generated.
7927 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7928                                  LoopVectorizationCostModel &CM) {
7929   unsigned WidestType;
7930   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7931   return WidestVectorRegBits / WidestType;
7932 }
7933 
7934 VectorizationFactor
7935 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7936   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7937   ElementCount VF = UserVF;
7938   // Outer loop handling: They may require CFG and instruction level
7939   // transformations before even evaluating whether vectorization is profitable.
7940   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7941   // the vectorization pipeline.
7942   if (!OrigLoop->isInnermost()) {
7943     // If the user doesn't provide a vectorization factor, determine a
7944     // reasonable one.
7945     if (UserVF.isZero()) {
7946       VF = ElementCount::getFixed(determineVPlanVF(
7947           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7948               .getFixedSize(),
7949           CM));
7950       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7951 
7952       // Make sure we have a VF > 1 for stress testing.
7953       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7954         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7955                           << "overriding computed VF.\n");
7956         VF = ElementCount::getFixed(4);
7957       }
7958     }
7959     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7960     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7961            "VF needs to be a power of two");
7962     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7963                       << "VF " << VF << " to build VPlans.\n");
7964     buildVPlans(VF, VF);
7965 
7966     // For VPlan build stress testing, we bail out after VPlan construction.
7967     if (VPlanBuildStressTest)
7968       return VectorizationFactor::Disabled();
7969 
7970     return {VF, 0 /*Cost*/};
7971   }
7972 
7973   LLVM_DEBUG(
7974       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7975                 "VPlan-native path.\n");
7976   return VectorizationFactor::Disabled();
7977 }
7978 
7979 Optional<VectorizationFactor>
7980 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7981   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7982   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7984     return None;
7985 
7986   // Invalidate interleave groups if all blocks of loop will be predicated.
7987   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7988       !useMaskedInterleavedAccesses(*TTI)) {
7989     LLVM_DEBUG(
7990         dbgs()
7991         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7992            "which requires masked-interleaved support.\n");
7993     if (CM.InterleaveInfo.invalidateGroups())
7994       // Invalidating interleave groups also requires invalidating all decisions
7995       // based on them, which includes widening decisions and uniform and scalar
7996       // values.
7997       CM.invalidateCostModelingDecisions();
7998   }
7999 
8000   ElementCount MaxUserVF =
8001       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
8002   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
8003   if (!UserVF.isZero() && UserVFIsLegal) {
8004     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
8005                       << " VF " << UserVF << ".\n");
8006     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
8007            "VF needs to be a power of two");
8008     // Collect the instructions (and their associated costs) that will be more
8009     // profitable to scalarize.
8010     CM.selectUserVectorizationFactor(UserVF);
8011     CM.collectInLoopReductions();
8012     buildVPlansWithVPRecipes(UserVF, UserVF);
8013     LLVM_DEBUG(printPlans(dbgs()));
8014     return {{UserVF, 0}};
8015   }
8016 
8017   // Populate the set of Vectorization Factor Candidates.
8018   ElementCountSet VFCandidates;
8019   for (auto VF = ElementCount::getFixed(1);
8020        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
8021     VFCandidates.insert(VF);
8022   for (auto VF = ElementCount::getScalable(1);
8023        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
8024     VFCandidates.insert(VF);
8025 
8026   for (const auto &VF : VFCandidates) {
8027     // Collect Uniform and Scalar instructions after vectorization with VF.
8028     CM.collectUniformsAndScalars(VF);
8029 
8030     // Collect the instructions (and their associated costs) that will be more
8031     // profitable to scalarize.
8032     if (VF.isVector())
8033       CM.collectInstsToScalarize(VF);
8034   }
8035 
8036   CM.collectInLoopReductions();
8037   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
8038   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
8039 
8040   LLVM_DEBUG(printPlans(dbgs()));
8041   if (!MaxFactors.hasVector())
8042     return VectorizationFactor::Disabled();
8043 
8044   // Select the optimal vectorization factor.
8045   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
8046 
8047   // Check if it is profitable to vectorize with runtime checks.
8048   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
8049   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
8050     bool PragmaThresholdReached =
8051         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
8052     bool ThresholdReached =
8053         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
8054     if ((ThresholdReached && !Hints.allowReordering()) ||
8055         PragmaThresholdReached) {
8056       ORE->emit([&]() {
8057         return OptimizationRemarkAnalysisAliasing(
8058                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
8059                    OrigLoop->getHeader())
8060                << "loop not vectorized: cannot prove it is safe to reorder "
8061                   "memory operations";
8062       });
8063       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8064       Hints.emitRemarkWithHints();
8065       return VectorizationFactor::Disabled();
8066     }
8067   }
8068   return SelectedVF;
8069 }
8070 
8071 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
8072   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
8073                     << '\n');
8074   BestVF = VF;
8075   BestUF = UF;
8076 
8077   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
8078     return !Plan->hasVF(VF);
8079   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
8081 }
8082 
8083 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
8084                                            DominatorTree *DT) {
8085   // Perform the actual loop transformation.
8086 
8087   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
8088   assert(BestVF.hasValue() && "Vectorization Factor is missing");
8089   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
8090 
8091   VPTransformState State{
8092       *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
8093   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
8094   State.TripCount = ILV.getOrCreateTripCount(nullptr);
8095   State.CanonicalIV = ILV.Induction;
8096 
8097   ILV.printDebugTracesAtStart();
8098 
8099   //===------------------------------------------------===//
8100   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
8104   //
8105   //===------------------------------------------------===//
8106 
8107   // 2. Copy and widen instructions from the old loop into the new loop.
8108   VPlans.front()->execute(&State);
8109 
8110   // 3. Fix the vectorized code: take care of header phi's, live-outs,
8111   //    predication, updating analyses.
8112   ILV.fixVectorizedLoop(State);
8113 
8114   ILV.printDebugTracesAtEnd();
8115 }
8116 
8117 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8118 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8119   for (const auto &Plan : VPlans)
8120     if (PrintVPlansInDotFormat)
8121       Plan->printDOT(O);
8122     else
8123       Plan->print(O);
8124 }
8125 #endif
8126 
8127 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8128     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8129 
  // We create new control-flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
8133   SmallVector<BasicBlock*> ExitingBlocks;
8134   OrigLoop->getExitingBlocks(ExitingBlocks);
8135   for (auto *BB : ExitingBlocks) {
8136     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8137     if (!Cmp || !Cmp->hasOneUse())
8138       continue;
8139 
8140     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8141     if (!DeadInstructions.insert(Cmp).second)
8142       continue;
8143 
    // The operand of the icmp is often a dead trunc, used by IndUpdate.
8145     // TODO: can recurse through operands in general
8146     for (Value *Op : Cmp->operands()) {
8147       if (isa<TruncInst>(Op) && Op->hasOneUse())
8148           DeadInstructions.insert(cast<Instruction>(Op));
8149     }
8150   }
8151 
8152   // We create new "steps" for induction variable updates to which the original
8153   // induction variables map. An original update instruction will be dead if
8154   // all its users except the induction variable are dead.
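  // E.g. (illustrative), an update such as '%iv.next = add i64 %iv, 1' is
  // dead once its only remaining user is the induction phi %iv itself.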
8155   auto *Latch = OrigLoop->getLoopLatch();
8156   for (auto &Induction : Legal->getInductionVars()) {
8157     PHINode *Ind = Induction.first;
8158     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8159 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
8162     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8163       continue;
8164 
8165     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8166           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8167         }))
8168       DeadInstructions.insert(IndUpdate);
8169 
8170     // We record as "Dead" also the type-casting instructions we had identified
8171     // during induction analysis. We don't need any handling for them in the
8172     // vectorized loop because we have proven that, under a proper runtime
8173     // test guarding the vectorized loop, the value of the phi, and the casted
8174     // value of the phi, are the same. The last instruction in this casting chain
8175     // will get its scalar/vector/widened def from the scalar/vector/widened def
8176     // of the respective phi node. Any other casts in the induction def-use chain
8177     // have no other uses outside the phi update chain, and will be ignored.
8178     InductionDescriptor &IndDes = Induction.second;
8179     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8180     DeadInstructions.insert(Casts.begin(), Casts.end());
8181   }
8182 }
8183 
8184 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8185 
8186 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8187 
8188 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
8189                                         Instruction::BinaryOps BinOp) {
8190   // When unrolling and the VF is 1, we only need to add a simple scalar.
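  // E.g. (illustrative), for an integer Val with StartIdx=2 and Step=%s this
  // emits 'add %val, (mul 2, %s)' named "induction".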
8191   Type *Ty = Val->getType();
8192   assert(!Ty->isVectorTy() && "Val must be a scalar");
8193 
8194   if (Ty->isFloatingPointTy()) {
8195     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
8196 
8197     // Floating-point operations inherit FMF via the builder's flags.
8198     Value *MulOp = Builder.CreateFMul(C, Step);
8199     return Builder.CreateBinOp(BinOp, Val, MulOp);
8200   }
8201   Constant *C = ConstantInt::get(Ty, StartIdx);
8202   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
8203 }
8204 
8205 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
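  // The loop metadata produced here looks roughly like (illustrative node
  // names):
  //   !0 = distinct !{!0, ..., !1}
  //   !1 = !{!"llvm.loop.unroll.runtime.disable"}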
8206   SmallVector<Metadata *, 4> MDs;
8207   // Reserve first location for self reference to the LoopID metadata node.
8208   MDs.push_back(nullptr);
8209   bool IsUnrollMetadata = false;
8210   MDNode *LoopID = L->getLoopID();
8211   if (LoopID) {
8212     // First find existing loop unrolling disable metadata.
8213     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8214       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8215       if (MD) {
8216         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8217         IsUnrollMetadata =
8218             S && S->getString().startswith("llvm.loop.unroll.disable");
8219       }
8220       MDs.push_back(LoopID->getOperand(i));
8221     }
8222   }
8223 
8224   if (!IsUnrollMetadata) {
8225     // Add runtime unroll disable metadata.
8226     LLVMContext &Context = L->getHeader()->getContext();
8227     SmallVector<Metadata *, 1> DisableOperands;
8228     DisableOperands.push_back(
8229         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8230     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8231     MDs.push_back(DisableNode);
8232     MDNode *NewLoopID = MDNode::get(Context, MDs);
8233     // Set operand 0 to refer to the loop id itself.
8234     NewLoopID->replaceOperandWith(0, NewLoopID);
8235     L->setLoopID(NewLoopID);
8236   }
8237 }
8238 
8239 //===--------------------------------------------------------------------===//
8240 // EpilogueVectorizerMainLoop
8241 //===--------------------------------------------------------------------===//
8242 
8243 /// This function is partially responsible for generating the control flow
8244 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8245 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8246   MDNode *OrigLoopID = OrigLoop->getLoopID();
8247   Loop *Lp = createVectorLoopSkeleton("");
8248 
8249   // Generate the code to check the minimum iteration count of the vector
8250   // epilogue (see below).
8251   EPI.EpilogueIterationCountCheck =
8252       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8253   EPI.EpilogueIterationCountCheck->setName("iter.check");
8254 
8255   // Generate the code to check any assumptions that we've made for SCEV
8256   // expressions.
8257   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8258 
8259   // Generate the code that checks at runtime if arrays overlap. We put the
8260   // checks into a separate block to make the more common case of few elements
8261   // faster.
8262   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8263 
8264   // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
8269   // the epilogue.
8270   EPI.MainLoopIterationCountCheck =
8271       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8272 
8273   // Generate the induction variable.
8274   OldInduction = Legal->getPrimaryInduction();
8275   Type *IdxTy = Legal->getWidestInductionType();
8276   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8277   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8278   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8279   EPI.VectorTripCount = CountRoundDown;
8280   Induction =
8281       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8282                               getDebugLocFromInstOrOperands(OldInduction));
8283 
8284   // Skip induction resume value creation here because they will be created in
8285   // the second pass. If we created them here, they wouldn't be used anyway,
8286   // because the vplan in the second pass still contains the inductions from the
8287   // original loop.
8288 
8289   return completeLoopSkeleton(Lp, OrigLoopID);
8290 }
8291 
8292 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8293   LLVM_DEBUG({
8294     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8295            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8296            << ", Main Loop UF:" << EPI.MainLoopUF
8297            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8298            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8299   });
8300 }
8301 
8302 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8303   DEBUG_WITH_TYPE(VerboseDebug, {
8304     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
8305   });
8306 }
8307 
8308 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8309     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8310   assert(L && "Expected valid Loop.");
8311   assert(Bypass && "Expected valid bypass basic block.");
8312   unsigned VFactor =
8313       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
8314   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8315   Value *Count = getOrCreateTripCount(L);
8316   // Reuse existing vector loop preheader for TC checks.
8317   // Note that new preheader block is generated for vector loop.
8318   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8319   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8320 
8321   // Generate code to check if the loop's trip count is less than VF * UF of the
8322   // main vector loop.
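  // E.g. (illustrative), for VF=4 and UF=2 this compares Count against 8,
  // using ULE instead of ULT when a scalar epilogue is required so that at
  // least one scalar iteration is always left for it.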
8323   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8324       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8325 
8326   Value *CheckMinIters = Builder.CreateICmp(
8327       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
8328       "min.iters.check");
8329 
8330   if (!ForEpilogue)
8331     TCCheckBlock->setName("vector.main.loop.iter.check");
8332 
8333   // Create new preheader for vector loop.
8334   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8335                                    DT, LI, nullptr, "vector.ph");
8336 
8337   if (ForEpilogue) {
8338     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8339                                  DT->getNode(Bypass)->getIDom()) &&
8340            "TC check is expected to dominate Bypass");
8341 
8342     // Update dominator for Bypass & LoopExit.
8343     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8344     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8345 
8346     LoopBypassBlocks.push_back(TCCheckBlock);
8347 
8348     // Save the trip count so we don't have to regenerate it in the
8349     // vec.epilog.iter.check. This is safe to do because the trip count
8350     // generated here dominates the vector epilog iter check.
8351     EPI.TripCount = Count;
8352   }
8353 
8354   ReplaceInstWithInst(
8355       TCCheckBlock->getTerminator(),
8356       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8357 
8358   return TCCheckBlock;
8359 }
8360 
8361 //===--------------------------------------------------------------------===//
8362 // EpilogueVectorizerEpilogueLoop
8363 //===--------------------------------------------------------------------===//
8364 
8365 /// This function is partially responsible for generating the control flow
8366 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8367 BasicBlock *
8368 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8369   MDNode *OrigLoopID = OrigLoop->getLoopID();
8370   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8371 
  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
8374   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8375   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8376   LoopVectorPreHeader =
8377       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8378                  LI, nullptr, "vec.epilog.ph");
8379   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8380                                           VecEpilogueIterationCountCheck);
8381 
8382   // Adjust the control flow taking the state info from the main loop
8383   // vectorization into account.
8384   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8385          "expected this to be saved from the previous pass.");
8386   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8387       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8388 
8389   DT->changeImmediateDominator(LoopVectorPreHeader,
8390                                EPI.MainLoopIterationCountCheck);
8391 
8392   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8393       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8394 
8395   if (EPI.SCEVSafetyCheck)
8396     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8397         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8398   if (EPI.MemSafetyCheck)
8399     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8400         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8401 
8402   DT->changeImmediateDominator(
8403       VecEpilogueIterationCountCheck,
8404       VecEpilogueIterationCountCheck->getSinglePredecessor());
8405 
8406   DT->changeImmediateDominator(LoopScalarPreHeader,
8407                                EPI.EpilogueIterationCountCheck);
8408   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
8409 
8410   // Keep track of bypass blocks, as they feed start values to the induction
8411   // phis in the scalar loop preheader.
8412   if (EPI.SCEVSafetyCheck)
8413     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8414   if (EPI.MemSafetyCheck)
8415     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8416   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8417 
8418   // Generate a resume induction for the vector epilogue and put it in the
8419   // vector epilogue preheader.
8420   Type *IdxTy = Legal->getWidestInductionType();
8421   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8422                                          LoopVectorPreHeader->getFirstNonPHI());
8423   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8424   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8425                            EPI.MainLoopIterationCountCheck);
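  // If the main vector loop is skipped (coming from its iteration count
  // check), the epilogue loop resumes at 0; otherwise it resumes where the
  // main vector loop left off, i.e. at its vector trip count.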
8426 
8427   // Generate the induction variable.
8428   OldInduction = Legal->getPrimaryInduction();
8429   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8430   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8431   Value *StartIdx = EPResumeVal;
8432   Induction =
8433       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8434                               getDebugLocFromInstOrOperands(OldInduction));
8435 
8436   // Generate induction resume values. These variables save the new starting
8437   // indexes for the scalar loop. They are used to test if there are any tail
8438   // iterations left once the vector loop has completed.
8439   // Note that when the vectorized epilogue is skipped due to iteration count
8440   // check, then the resume value for the induction variable comes from
8441   // the trip count of the main vector loop, hence passing the AdditionalBypass
8442   // argument.
8443   createInductionResumeValues(Lp, CountRoundDown,
8444                               {VecEpilogueIterationCountCheck,
8445                                EPI.VectorTripCount} /* AdditionalBypass */);
8446 
8447   AddRuntimeUnrollDisableMetaData(Lp);
8448   return completeLoopSkeleton(Lp, OrigLoopID);
8449 }
8450 
8451 BasicBlock *
8452 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8453     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8454 
8455   assert(EPI.TripCount &&
8456          "Expected trip count to have been saved in the first pass.");
8457   assert(
8458       (!isa<Instruction>(EPI.TripCount) ||
8459        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8460       "saved trip count does not dominate insertion point.");
8461   Value *TC = EPI.TripCount;
8462   IRBuilder<> Builder(Insert->getTerminator());
8463   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8464 
8465   // Generate code to check if the remaining iteration count is less than the
8466   // VF * UF of the vector epilogue loop.
8467   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8468       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8469 
8470   Value *CheckMinIters = Builder.CreateICmp(
8471       P, Count,
8472       ConstantInt::get(Count->getType(),
8473                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8474       "min.epilog.iters.check");
8475 
8476   ReplaceInstWithInst(
8477       Insert->getTerminator(),
8478       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8479 
8480   LoopBypassBlocks.push_back(Insert);
8481   return Insert;
8482 }
8483 
8484 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8485   LLVM_DEBUG({
8486     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8487            << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8488            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8489   });
8490 }
8491 
8492 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8493   DEBUG_WITH_TYPE(VerboseDebug, {
8494     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8495   });
8496 }
8497 
8498 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8499     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8500   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8501   bool PredicateAtRangeStart = Predicate(Range.Start);
8502 
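  // Walk the powers-of-two VFs in [Range.Start, Range.End) and clamp Range.End
  // at the first VF whose decision differs from the one at Range.Start, so all
  // VFs left in the range share the same decision.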
8503   for (ElementCount TmpVF = Range.Start * 2;
8504        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8505     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8506       Range.End = TmpVF;
8507       break;
8508     }
8509 
8510   return PredicateAtRangeStart;
8511 }
8512 
8513 /// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
8514 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8515 /// of VFs starting at a given VF and extending it as much as possible. Each
8516 /// vectorization decision can potentially shorten this sub-range during
8517 /// buildVPlan().
8518 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8519                                            ElementCount MaxVF) {
8520   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
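  // Each buildVPlan call may clamp SubRange.End at the first VF where a
  // vectorization decision changes, so the next plan starts from that VF.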
8521   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8522     VFRange SubRange = {VF, MaxVFPlusOne};
8523     VPlans.push_back(buildVPlan(SubRange));
8524     VF = SubRange.End;
8525   }
8526 }
8527 
8528 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8529                                          VPlanPtr &Plan) {
8530   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8531 
8532   // Look for cached value.
8533   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8534   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8535   if (ECEntryIt != EdgeMaskCache.end())
8536     return ECEntryIt->second;
8537 
8538   VPValue *SrcMask = createBlockInMask(Src, Plan);
8539 
8540   // The terminator has to be a branch inst!
8541   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8542   assert(BI && "Unexpected terminator found");
8543 
8544   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8545     return EdgeMaskCache[Edge] = SrcMask;
8546 
8547   // If source is an exiting block, we know the exit edge is dynamically dead
8548   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8549   // adding uses of an otherwise potentially dead instruction.
8550   if (OrigLoop->isLoopExiting(Src))
8551     return EdgeMaskCache[Edge] = SrcMask;
8552 
8553   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8554   assert(EdgeMask && "No Edge Mask found for condition");
8555 
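  // If Dst is reached on the false edge of the branch, the edge mask is the
  // negation of the branch condition.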
8556   if (BI->getSuccessor(0) != Dst)
8557     EdgeMask = Builder.createNot(EdgeMask);
8558 
8559   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8560     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8561     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8562     // The select version does not introduce new UB if SrcMask is false and
8563     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8564     VPValue *False = Plan->getOrAddVPValue(
8565         ConstantInt::getFalse(BI->getCondition()->getType()));
8566     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8567   }
8568 
8569   return EdgeMaskCache[Edge] = EdgeMask;
8570 }
8571 
8572 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8573   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8574 
8575   // Look for cached value.
8576   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8577   if (BCEntryIt != BlockMaskCache.end())
8578     return BCEntryIt->second;
8579 
8580   // All-one mask is modelled as no-mask following the convention for masked
8581   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8582   VPValue *BlockMask = nullptr;
8583 
8584   if (OrigLoop->getHeader() == BB) {
8585     if (!CM.blockNeedsPredication(BB))
8586       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8587 
8588     // Create the block in mask as the first non-phi instruction in the block.
8589     VPBuilder::InsertPointGuard Guard(Builder);
8590     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8591     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8592 
8593     // Introduce the early-exit compare IV <= BTC to form header block mask.
8594     // This is used instead of IV < TC because TC may wrap, unlike BTC.
8595     // Start by constructing the desired canonical IV.
8596     VPValue *IV = nullptr;
8597     if (Legal->getPrimaryInduction())
8598       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8599     else {
8600       auto IVRecipe = new VPWidenCanonicalIVRecipe();
8601       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8602       IV = IVRecipe->getVPSingleValue();
8603     }
8604     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8605     bool TailFolded = !CM.isScalarEpilogueAllowed();
8606 
8607     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8608       // While ActiveLaneMask is a binary op that consumes the loop tripcount
8609       // as a second argument, we only pass the IV here and extract the
8610       // tripcount from the transform state where codegen of the VP instructions
8611       // happens.
8612       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8613     } else {
8614       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8615     }
8616     return BlockMaskCache[BB] = BlockMask;
8617   }
8618 
8619   // This is the block mask. We OR all incoming edges.
8620   for (auto *Predecessor : predecessors(BB)) {
8621     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8622     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8623       return BlockMaskCache[BB] = EdgeMask;
8624 
8625     if (!BlockMask) { // BlockMask still has its initial nullptr value.
8626       BlockMask = EdgeMask;
8627       continue;
8628     }
8629 
8630     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8631   }
8632 
8633   return BlockMaskCache[BB] = BlockMask;
8634 }
8635 
8636 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8637                                                 ArrayRef<VPValue *> Operands,
8638                                                 VFRange &Range,
8639                                                 VPlanPtr &Plan) {
8640   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8641          "Must be called with either a load or store");
8642 
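  // Based on the cost model's per-VF widening decision, determine whether this
  // memory instruction will be widened (or interleaved) rather than scalarized
  // for a given VF.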
8643   auto willWiden = [&](ElementCount VF) -> bool {
8644     if (VF.isScalar())
8645       return false;
8646     LoopVectorizationCostModel::InstWidening Decision =
8647         CM.getWideningDecision(I, VF);
8648     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8649            "CM decision should be taken at this point.");
8650     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8651       return true;
8652     if (CM.isScalarAfterVectorization(I, VF) ||
8653         CM.isProfitableToScalarize(I, VF))
8654       return false;
8655     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8656   };
8657 
8658   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8659     return nullptr;
8660 
8661   VPValue *Mask = nullptr;
8662   if (Legal->isMaskRequired(I))
8663     Mask = createBlockInMask(I->getParent(), Plan);
8664 
8665   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8666     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
8667 
8668   StoreInst *Store = cast<StoreInst>(I);
8669   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8670                                             Mask);
8671 }
8672 
8673 VPWidenIntOrFpInductionRecipe *
8674 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8675                                            ArrayRef<VPValue *> Operands) const {
8676   // Check if this is an integer or fp induction. If so, build the recipe that
8677   // produces its scalar and vector values.
8678   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8679   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8680       II.getKind() == InductionDescriptor::IK_FpInduction) {
8681     assert(II.getStartValue() ==
8682            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8683     const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8684     return new VPWidenIntOrFpInductionRecipe(
8685         Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
8686   }
8687 
8688   return nullptr;
8689 }
8690 
8691 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8692     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8693     VPlan &Plan) const {
8694   // Optimize the special case where the source is a constant integer
8695   // induction variable. Notice that we can only optimize the 'trunc' case
8696   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8697   // (c) other casts depend on pointer size.
8698 
8699   // Determine whether \p K is a truncation based on an induction variable that
8700   // can be optimized.
8701   auto isOptimizableIVTruncate =
8702       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8703     return [=](ElementCount VF) -> bool {
8704       return CM.isOptimizableIVTruncate(K, VF);
8705     };
8706   };
8707 
8708   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8709           isOptimizableIVTruncate(I), Range)) {
8710 
8711     InductionDescriptor II =
8712         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8713     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8714     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8715                                              Start, nullptr, I);
8716   }
8717   return nullptr;
8718 }
8719 
8720 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8721                                                 ArrayRef<VPValue *> Operands,
8722                                                 VPlanPtr &Plan) {
8723   // If all incoming values are equal, the incoming VPValue can be used directly
8724   // instead of creating a new VPBlendRecipe.
8725   VPValue *FirstIncoming = Operands[0];
8726   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8727         return FirstIncoming == Inc;
8728       })) {
8729     return Operands[0];
8730   }
8731 
8732   // We know that all PHIs in non-header blocks are converted into selects, so
8733   // we don't have to worry about the insertion order and we can just use the
8734   // builder. At this point we generate the predication tree. There may be
8735   // duplications since this is a simple recursive scan, but future
8736   // optimizations will clean it up.
8737   SmallVector<VPValue *, 2> OperandsWithMask;
8738   unsigned NumIncoming = Phi->getNumIncomingValues();
8739 
8740   for (unsigned In = 0; In < NumIncoming; In++) {
8741     VPValue *EdgeMask =
8742       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8743     assert((EdgeMask || NumIncoming == 1) &&
8744            "Multiple predecessors with one having a full mask");
8745     OperandsWithMask.push_back(Operands[In]);
8746     if (EdgeMask)
8747       OperandsWithMask.push_back(EdgeMask);
8748   }
8749   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8750 }
8751 
8752 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8753                                                    ArrayRef<VPValue *> Operands,
8754                                                    VFRange &Range) const {
8755 
8756   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8757       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8758       Range);
8759 
8760   if (IsPredicated)
8761     return nullptr;
8762 
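  // Do not attempt to widen calls to these marker-like intrinsics; returning
  // nullptr lets the caller fall back to its default handling.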
8763   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8764   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8765              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8766              ID == Intrinsic::pseudoprobe ||
8767              ID == Intrinsic::experimental_noalias_scope_decl))
8768     return nullptr;
8769 
8770   auto willWiden = [&](ElementCount VF) -> bool {
8771     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8772     // The following case may be scalarized depending on the VF.
8773     // The flag shows whether we use an intrinsic or a regular call for the
8774     // vectorized version of the instruction.
8775     // Is it beneficial to perform the intrinsic call compared to the lib call?
8776     bool NeedToScalarize = false;
8777     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8778     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8779     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8780     assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
8781            "Either the intrinsic cost or vector call cost must be valid");
8782     return UseVectorIntrinsic || !NeedToScalarize;
8783   };
8784 
8785   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8786     return nullptr;
8787 
8788   ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
8789   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8790 }
8791 
8792 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8793   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8794          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8795   // Instruction should be widened, unless it is scalar after vectorization,
8796   // scalarization is profitable or it is predicated.
8797   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8798     return CM.isScalarAfterVectorization(I, VF) ||
8799            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8800   };
8801   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8802                                                              Range);
8803 }
8804 
8805 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8806                                            ArrayRef<VPValue *> Operands) const {
8807   auto IsVectorizableOpcode = [](unsigned Opcode) {
8808     switch (Opcode) {
8809     case Instruction::Add:
8810     case Instruction::And:
8811     case Instruction::AShr:
8812     case Instruction::BitCast:
8813     case Instruction::FAdd:
8814     case Instruction::FCmp:
8815     case Instruction::FDiv:
8816     case Instruction::FMul:
8817     case Instruction::FNeg:
8818     case Instruction::FPExt:
8819     case Instruction::FPToSI:
8820     case Instruction::FPToUI:
8821     case Instruction::FPTrunc:
8822     case Instruction::FRem:
8823     case Instruction::FSub:
8824     case Instruction::ICmp:
8825     case Instruction::IntToPtr:
8826     case Instruction::LShr:
8827     case Instruction::Mul:
8828     case Instruction::Or:
8829     case Instruction::PtrToInt:
8830     case Instruction::SDiv:
8831     case Instruction::Select:
8832     case Instruction::SExt:
8833     case Instruction::Shl:
8834     case Instruction::SIToFP:
8835     case Instruction::SRem:
8836     case Instruction::Sub:
8837     case Instruction::Trunc:
8838     case Instruction::UDiv:
8839     case Instruction::UIToFP:
8840     case Instruction::URem:
8841     case Instruction::Xor:
8842     case Instruction::ZExt:
8843       return true;
8844     }
8845     return false;
8846   };
8847 
8848   if (!IsVectorizableOpcode(I->getOpcode()))
8849     return nullptr;
8850 
8851   // Success: widen this instruction.
8852   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8853 }
8854 
8855 void VPRecipeBuilder::fixHeaderPhis() {
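  // Add the value incoming from the loop latch (the backedge value) as an
  // operand of each header phi recipe in PhisToFix, now that recipes for all
  // instructions have been created.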
8856   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8857   for (VPWidenPHIRecipe *R : PhisToFix) {
8858     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8859     VPRecipeBase *IncR =
8860         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8861     R->addOperand(IncR->getVPSingleValue());
8862   }
8863 }
8864 
8865 VPBasicBlock *VPRecipeBuilder::handleReplication(
8866     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8867     VPlanPtr &Plan) {
8868   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8869       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8870       Range);
8871 
8872   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8873       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
8874 
8875   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8876                                        IsUniform, IsPredicated);
8877   setRecipe(I, Recipe);
8878   Plan->addVPValue(I, Recipe);
8879 
8880   // Find if I uses a predicated instruction. If so, it will use its scalar
8881   // value. Avoid hoisting the insert-element which packs the scalar value into
8882   // a vector value, as that happens iff all users use the vector value.
8883   for (VPValue *Op : Recipe->operands()) {
8884     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8885     if (!PredR)
8886       continue;
8887     auto *RepR =
8888         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8889     assert(RepR->isPredicated() &&
8890            "expected Replicate recipe to be predicated");
8891     RepR->setAlsoPack(false);
8892   }
8893 
8894   // Finalize the recipe for Instr, handling the non-predicated case first.
8895   if (!IsPredicated) {
8896     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8897     VPBB->appendRecipe(Recipe);
8898     return VPBB;
8899   }
8900   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8901   assert(VPBB->getSuccessors().empty() &&
8902          "VPBB has successors when handling predicated replication.");
8903   // Record predicated instructions for above packing optimizations.
8904   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8905   VPBlockUtils::insertBlockAfter(Region, VPBB);
8906   auto *RegSucc = new VPBasicBlock();
8907   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8908   return RegSucc;
8909 }
8910 
8911 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8912                                                       VPRecipeBase *PredRecipe,
8913                                                       VPlanPtr &Plan) {
8914   // Instructions marked for predication are replicated and placed under an
8915   // if-then construct to prevent side-effects.
8916 
8917   // Generate recipes to compute the block mask for this region.
8918   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8919 
8920   // Build the triangular if-then region.
8921   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8922   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8923   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8924   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8925   auto *PHIRecipe = Instr->getType()->isVoidTy()
8926                         ? nullptr
8927                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8928   if (PHIRecipe) {
8929     Plan->removeVPValueFor(Instr);
8930     Plan->addVPValue(Instr, PHIRecipe);
8931   }
8932   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8933   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8934   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8935 
8936   // Note: first set Entry as region entry and then connect successors starting
8937   // from it in order, to propagate the "parent" of each VPBasicBlock.
8938   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8939   VPBlockUtils::connectBlocks(Pred, Exit);
8940 
8941   return Region;
8942 }
8943 
8944 VPRecipeOrVPValueTy
8945 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8946                                         ArrayRef<VPValue *> Operands,
8947                                         VFRange &Range, VPlanPtr &Plan) {
8948   // First, check for specific widening recipes that deal with calls, memory
8949   // operations, inductions and Phi nodes.
8950   if (auto *CI = dyn_cast<CallInst>(Instr))
8951     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8952 
8953   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8954     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8955 
8956   VPRecipeBase *Recipe;
8957   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8958     if (Phi->getParent() != OrigLoop->getHeader())
8959       return tryToBlend(Phi, Operands, Plan);
8960     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8961       return toVPRecipeResult(Recipe);
8962 
8963     VPWidenPHIRecipe *PhiRecipe = nullptr;
8964     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8965       VPValue *StartV = Operands[0];
8966       if (Legal->isReductionVariable(Phi)) {
8967         RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8968         assert(RdxDesc.getRecurrenceStartValue() ==
8969                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8970         PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
8971       } else {
8972         PhiRecipe = new VPWidenPHIRecipe(Phi, *StartV);
8973       }
8974 
8975       // Record the incoming value from the backedge, so we can add the incoming
8976       // value from the backedge after all recipes have been created.
8977       recordRecipeOf(cast<Instruction>(
8978           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8979       PhisToFix.push_back(PhiRecipe);
8980     } else {
8981       // TODO: record start and backedge value for remaining pointer induction
8982       // phis.
8983       assert(Phi->getType()->isPointerTy() &&
8984              "only pointer phis should be handled here");
8985       PhiRecipe = new VPWidenPHIRecipe(Phi);
8986     }
8987 
8988     return toVPRecipeResult(PhiRecipe);
8989   }
8990 
8991   if (isa<TruncInst>(Instr) &&
8992       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8993                                                Range, *Plan)))
8994     return toVPRecipeResult(Recipe);
8995 
8996   if (!shouldWiden(Instr, Range))
8997     return nullptr;
8998 
8999   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
9000     return toVPRecipeResult(new VPWidenGEPRecipe(
9001         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
9002 
9003   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
9004     bool InvariantCond =
9005         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
9006     return toVPRecipeResult(new VPWidenSelectRecipe(
9007         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
9008   }
9009 
9010   return toVPRecipeResult(tryToWiden(Instr, Operands));
9011 }
9012 
9013 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
9014                                                         ElementCount MaxVF) {
9015   assert(OrigLoop->isInnermost() && "Inner loop expected.");
9016 
9017   // Collect instructions from the original loop that will become trivially dead
9018   // in the vectorized loop. We don't need to vectorize these instructions. For
9019   // example, original induction update instructions can become dead because we
9020   // separately emit induction "steps" when generating code for the new loop.
9021   // Similarly, we create a new latch condition when setting up the structure
9022   // of the new loop, so the old one can become dead.
9023   SmallPtrSet<Instruction *, 4> DeadInstructions;
9024   collectTriviallyDeadInstructions(DeadInstructions);
9025 
9026   // Add assume instructions we need to drop to DeadInstructions, to prevent
9027   // them from being added to the VPlan.
9028   // TODO: We only need to drop assumes in blocks that get flattened. If the
9029   // control flow is preserved, we should keep them.
9030   auto &ConditionalAssumes = Legal->getConditionalAssumes();
9031   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
9032 
9033   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
9034   // Dead instructions do not need sinking. Remove them from SinkAfter.
9035   for (Instruction *I : DeadInstructions)
9036     SinkAfter.erase(I);
9037 
9038   // Cannot sink instructions after dead instructions (there won't be any
9039   // recipes for them). Instead, find the first non-dead previous instruction.
9040   for (auto &P : Legal->getSinkAfter()) {
9041     Instruction *SinkTarget = P.second;
9042     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
9043     (void)FirstInst;
9044     while (DeadInstructions.contains(SinkTarget)) {
9045       assert(
9046           SinkTarget != FirstInst &&
9047           "Must find a live instruction (at least the one feeding the "
9048           "first-order recurrence PHI) before reaching beginning of the block");
9049       SinkTarget = SinkTarget->getPrevNode();
9050       assert(SinkTarget != P.first &&
9051              "sink source equals target, no sinking required");
9052     }
9053     P.second = SinkTarget;
9054   }
9055 
9056   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9057   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9058     VFRange SubRange = {VF, MaxVFPlusOne};
9059     VPlans.push_back(
9060         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9061     VF = SubRange.End;
9062   }
9063 }
9064 
9065 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9066     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9067     const MapVector<Instruction *, Instruction *> &SinkAfter) {
9068 
9069   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9070 
9071   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9072 
9073   // ---------------------------------------------------------------------------
9074   // Pre-construction: record ingredients whose recipes we'll need to further
9075   // process after constructing the initial VPlan.
9076   // ---------------------------------------------------------------------------
9077 
9078   // Mark instructions we'll need to sink later and their targets as
9079   // ingredients whose recipe we'll need to record.
9080   for (auto &Entry : SinkAfter) {
9081     RecipeBuilder.recordRecipeOf(Entry.first);
9082     RecipeBuilder.recordRecipeOf(Entry.second);
9083   }
9084   for (auto &Reduction : CM.getInLoopReductionChains()) {
9085     PHINode *Phi = Reduction.first;
9086     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9087     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9088 
9089     RecipeBuilder.recordRecipeOf(Phi);
9090     for (auto &R : ReductionOperations) {
9091       RecipeBuilder.recordRecipeOf(R);
9092       // For min/max reductions, where we have a pair of icmp/select, we also
9093       // need to record the ICmp recipe, so it can be removed later.
9094       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9095         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9096     }
9097   }
9098 
9099   // For each interleave group which is relevant for this (possibly trimmed)
9100   // Range, add it to the set of groups to be later applied to the VPlan and add
9101   // placeholders for its members' Recipes which we'll be replacing with a
9102   // single VPInterleaveRecipe.
9103   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9104     auto applyIG = [IG, this](ElementCount VF) -> bool {
9105       return (VF.isVector() && // Query is illegal for VF == 1
9106               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9107                   LoopVectorizationCostModel::CM_Interleave);
9108     };
9109     if (!getDecisionAndClampRange(applyIG, Range))
9110       continue;
9111     InterleaveGroups.insert(IG);
9112     for (unsigned i = 0; i < IG->getFactor(); i++)
9113       if (Instruction *Member = IG->getMember(i))
9114         RecipeBuilder.recordRecipeOf(Member);
9115   }
9116 
9117   // ---------------------------------------------------------------------------
9118   // Build initial VPlan: Scan the body of the loop in a topological order to
9119   // visit each basic block after having visited its predecessor basic blocks.
9120   // ---------------------------------------------------------------------------
9121 
9122   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9123   auto Plan = std::make_unique<VPlan>();
9124   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9125   Plan->setEntry(VPBB);
9126 
9127   // Scan the body of the loop in a topological order to visit each basic block
9128   // after having visited its predecessor basic blocks.
9129   LoopBlocksDFS DFS(OrigLoop);
9130   DFS.perform(LI);
9131 
9132   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9133     // Relevant instructions from basic block BB will be grouped into VPRecipe
9134     // ingredients and fill a new VPBasicBlock.
9135     unsigned VPBBsForBB = 0;
9136     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9137     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9138     VPBB = FirstVPBBForBB;
9139     Builder.setInsertPoint(VPBB);
9140 
9141     // Introduce each ingredient into VPlan.
9142     // TODO: Model and preserve debug intrinsics in VPlan.
9143     for (Instruction &I : BB->instructionsWithoutDebug()) {
9144       Instruction *Instr = &I;
9145 
9146       // First filter out irrelevant instructions, to ensure no recipes are
9147       // built for them.
9148       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9149         continue;
9150 
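      // For header phis only the start value (incoming from the preheader) is
      // passed as an operand here; for reduction and first-order recurrence
      // phis the backedge value is added later by fixHeaderPhis.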
9151       SmallVector<VPValue *, 4> Operands;
9152       auto *Phi = dyn_cast<PHINode>(Instr);
9153       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9154         Operands.push_back(Plan->getOrAddVPValue(
9155             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9156       } else {
9157         auto OpRange = Plan->mapToVPValues(Instr->operands());
9158         Operands = {OpRange.begin(), OpRange.end()};
9159       }
9160       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9161               Instr, Operands, Range, Plan)) {
9162         // If Instr can be simplified to an existing VPValue, use it.
9163         if (RecipeOrValue.is<VPValue *>()) {
9164           auto *VPV = RecipeOrValue.get<VPValue *>();
9165           Plan->addVPValue(Instr, VPV);
9166           // If the re-used value is a recipe, register the recipe for the
9167           // instruction, in case the recipe for Instr needs to be recorded.
9168           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9169             RecipeBuilder.setRecipe(Instr, R);
9170           continue;
9171         }
9172         // Otherwise, add the new recipe.
9173         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9174         for (auto *Def : Recipe->definedValues()) {
9175           auto *UV = Def->getUnderlyingValue();
9176           Plan->addVPValue(UV, Def);
9177         }
9178 
9179         RecipeBuilder.setRecipe(Instr, Recipe);
9180         VPBB->appendRecipe(Recipe);
9181         continue;
9182       }
9183 
9184       // Otherwise, if all widening options failed, Instruction is to be
9185       // replicated. This may create a successor for VPBB.
9186       VPBasicBlock *NextVPBB =
9187           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9188       if (NextVPBB != VPBB) {
9189         VPBB = NextVPBB;
9190         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9191                                     : "");
9192       }
9193     }
9194   }
9195 
9196   RecipeBuilder.fixHeaderPhis();
9197 
9198   // Discard the empty dummy pre-entry VPBasicBlock. Note that other
9199   // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
9200   // original basic blocks with no recipes.
9201   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
9202   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
9203   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
9204   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
9205   delete PreEntry;
9206 
9207   // ---------------------------------------------------------------------------
9208   // Transform initial VPlan: Apply previously taken decisions, in order, to
9209   // bring the VPlan to its final state.
9210   // ---------------------------------------------------------------------------
9211 
9212   // Apply Sink-After legal constraints.
9213   for (auto &Entry : SinkAfter) {
9214     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9215     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9216 
9217     auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9218       auto *Region =
9219           dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9220       if (Region && Region->isReplicator()) {
9221         assert(Region->getNumSuccessors() == 1 &&
9222                Region->getNumPredecessors() == 1 && "Expected SESE region!");
9223         assert(R->getParent()->size() == 1 &&
9224                "A recipe in an original replicator region must be the only "
9225                "recipe in its block");
9226         return Region;
9227       }
9228       return nullptr;
9229     };
9230     auto *TargetRegion = GetReplicateRegion(Target);
9231     auto *SinkRegion = GetReplicateRegion(Sink);
9232     if (!SinkRegion) {
9233       // If the sink source is not a replicate region, sink the recipe directly.
9234       if (TargetRegion) {
9235         // The target is in a replication region, make sure to move Sink to
9236         // the block after it, not into the replication region itself.
9237         VPBasicBlock *NextBlock =
9238             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9239         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9240       } else
9241         Sink->moveAfter(Target);
9242       continue;
9243     }
9244 
9245     // The sink source is in a replicate region. Unhook the region from the CFG.
9246     auto *SinkPred = SinkRegion->getSinglePredecessor();
9247     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9248     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9249     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9250     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9251 
9252     if (TargetRegion) {
9253       // The target recipe is also in a replicate region, move the sink region
9254       // after the target region.
9255       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9256       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9257       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9258       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9259     } else {
9260       // The sink source is in a replicate region, we need to move the whole
9261       // replicate region, which should only contain a single recipe in the main
9262       // block.
9263       auto *SplitBlock =
9264           Target->getParent()->splitAt(std::next(Target->getIterator()));
9265 
9266       auto *SplitPred = SplitBlock->getSinglePredecessor();
9267 
9268       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9269       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9270       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9271       if (VPBB == SplitPred)
9272         VPBB = SplitBlock;
9273     }
9274   }
9275 
9276   // Interleave memory: for each Interleave Group we marked earlier as relevant
9277   // for this VPlan, replace the Recipes widening its memory instructions with a
9278   // single VPInterleaveRecipe at its insertion point.
9279   for (auto IG : InterleaveGroups) {
9280     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9281         RecipeBuilder.getRecipe(IG->getInsertPos()));
9282     SmallVector<VPValue *, 4> StoredValues;
9283     for (unsigned i = 0; i < IG->getFactor(); ++i)
9284       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
9285         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
9286 
9287     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9288                                         Recipe->getMask());
9289     VPIG->insertBefore(Recipe);
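    // Migrate the uses of each member's defined value to the corresponding
    // value produced by the interleave recipe, then erase the per-member
    // recipes.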
9290     unsigned J = 0;
9291     for (unsigned i = 0; i < IG->getFactor(); ++i)
9292       if (Instruction *Member = IG->getMember(i)) {
9293         if (!Member->getType()->isVoidTy()) {
9294           VPValue *OriginalV = Plan->getVPValue(Member);
9295           Plan->removeVPValueFor(Member);
9296           Plan->addVPValue(Member, VPIG->getVPValue(J));
9297           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9298           J++;
9299         }
9300         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9301       }
9302   }
9303 
9304   // Adjust the recipes for any inloop reductions.
9305   adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start);
9306 
9307   // Finally, if tail is folded by masking, introduce selects between the phi
9308   // and the live-out instruction of each reduction, at the end of the latch.
9309   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
9310     Builder.setInsertPoint(VPBB);
9311     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9312     for (auto &Reduction : Legal->getReductionVars()) {
9313       if (CM.isInLoopReduction(Reduction.first))
9314         continue;
9315       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
9316       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
9317       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
9318     }
9319   }
9320 
9321   VPlanTransforms::sinkScalarOperands(*Plan);
9322   VPlanTransforms::mergeReplicateRegions(*Plan);
9323 
9324   std::string PlanName;
9325   raw_string_ostream RSO(PlanName);
9326   ElementCount VF = Range.Start;
9327   Plan->addVF(VF);
9328   RSO << "Initial VPlan for VF={" << VF;
9329   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9330     Plan->addVF(VF);
9331     RSO << "," << VF;
9332   }
9333   RSO << "},UF>=1";
9334   RSO.flush();
9335   Plan->setName(PlanName);
9336 
9337   return Plan;
9338 }
9339 
9340 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9341   // Outer loop handling: outer loops may require CFG and instruction level
9342   // transformations before even evaluating whether vectorization is profitable.
9343   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9344   // the vectorization pipeline.
9345   assert(!OrigLoop->isInnermost());
9346   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9347 
9348   // Create new empty VPlan
9349   auto Plan = std::make_unique<VPlan>();
9350 
9351   // Build hierarchical CFG
9352   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9353   HCFGBuilder.buildHierarchicalCFG();
9354 
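  // Register all VFs in the candidate range with the single plan built for the
  // VPlan-native path.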
9355   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9356        VF *= 2)
9357     Plan->addVF(VF);
9358 
9359   if (EnableVPlanPredication) {
9360     VPlanPredicator VPP(*Plan);
9361     VPP.predicate();
9362 
9363     // Avoid running transformation to recipes until masked code generation in
9364     // VPlan-native path is in place.
9365     return Plan;
9366   }
9367 
9368   SmallPtrSet<Instruction *, 1> DeadInstructions;
9369   VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9370                                              Legal->getInductionVars(),
9371                                              DeadInstructions, *PSE.getSE());
9372   return Plan;
9373 }
9374 
9375 // Adjust the recipes for any inloop reductions. The chain of instructions
9376 // leading from the loop exit instr to the phi need to be converted to
9377 // reductions, with one operand being vector and the other being the scalar
9378 // reduction chain.
9379 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9380     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9381   for (auto &Reduction : CM.getInLoopReductionChains()) {
9382     PHINode *Phi = Reduction.first;
9383     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9384     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9385 
9386     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9387       continue;
9388 
9389     // ReductionOperations are ordered top-down from the phi's use to the
9390     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9391     // which of the two operands will remain scalar and which will be reduced.
9392     // For minmax the chain will be the select instructions.
9393     Instruction *Chain = Phi;
9394     for (Instruction *R : ReductionOperations) {
9395       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9396       RecurKind Kind = RdxDesc.getRecurrenceKind();
9397 
9398       VPValue *ChainOp = Plan->getVPValue(Chain);
9399       unsigned FirstOpId;
9400       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9401         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9402                "Expected to replace a VPWidenSelectSC");
9403         FirstOpId = 1;
9404       } else {
9405         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
9406                "Expected to replace a VPWidenSC");
9407         FirstOpId = 0;
9408       }
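      // The operand that is not the chain value is the one to be reduced as a
      // vector.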
9409       unsigned VecOpId =
9410           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9411       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9412 
9413       auto *CondOp = CM.foldTailByMasking()
9414                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9415                          : nullptr;
9416       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9417           &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9418       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9419       Plan->removeVPValueFor(R);
9420       Plan->addVPValue(R, RedRecipe);
9421       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9422       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9423       WidenRecipe->eraseFromParent();
9424 
9425       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9426         VPRecipeBase *CompareRecipe =
9427             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9428         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9429                "Expected to replace a VPWidenSC");
9430         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9431                "Expected no remaining users");
9432         CompareRecipe->eraseFromParent();
9433       }
9434       Chain = R;
9435     }
9436   }
9437 }
9438 
9439 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9440 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9441                                VPSlotTracker &SlotTracker) const {
9442   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9443   IG->getInsertPos()->printAsOperand(O, false);
9444   O << ", ";
9445   getAddr()->printAsOperand(O, SlotTracker);
9446   VPValue *Mask = getMask();
9447   if (Mask) {
9448     O << ", ";
9449     Mask->printAsOperand(O, SlotTracker);
9450   }
9451   for (unsigned i = 0; i < IG->getFactor(); ++i)
9452     if (Instruction *I = IG->getMember(i))
9453       O << "\n" << Indent << "  " << VPlanIngredient(I) << " " << i;
9454 }
9455 #endif
9456 
9457 void VPWidenCallRecipe::execute(VPTransformState &State) {
9458   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9459                                   *this, State);
9460 }
9461 
9462 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9463   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9464                                     this, *this, InvariantCond, State);
9465 }
9466 
9467 void VPWidenRecipe::execute(VPTransformState &State) {
9468   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9469 }
9470 
9471 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9472   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9473                       *this, State.UF, State.VF, IsPtrLoopInvariant,
9474                       IsIndexLoopInvariant, State);
9475 }
9476 
9477 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9478   assert(!State.Instance && "Int or FP induction being replicated.");
9479   State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9480                                    getTruncInst(), getVPValue(0),
9481                                    getCastValue(), State);
9482 }
9483 
9484 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9485   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
9486                                  this, State);
9487 }
9488 
9489 void VPBlendRecipe::execute(VPTransformState &State) {
9490   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9491   // We know that all PHIs in non-header blocks are converted into
9492   // selects, so we don't have to worry about the insertion order and we
9493   // can just use the builder.
9494   // At this point we generate the predication tree. There may be
9495   // duplications since this is a simple recursive scan, but future
9496   // optimizations will clean it up.
9497 
9498   unsigned NumIncoming = getNumIncomingValues();
9499 
9500   // Generate a sequence of selects of the form:
9501   // SELECT(Mask3, In3,
9502   //        SELECT(Mask2, In2,
9503   //               SELECT(Mask1, In1,
9504   //                      In0)))
9505   // Note that Mask0 is never used: lanes for which no path reaches this phi
9506   // are essentially undef and are taken from In0.
9507   InnerLoopVectorizer::VectorParts Entry(State.UF);
9508   for (unsigned In = 0; In < NumIncoming; ++In) {
9509     for (unsigned Part = 0; Part < State.UF; ++Part) {
9510       // We might have single edge PHIs (blocks) - use an identity
9511       // 'select' for the first PHI operand.
9512       Value *In0 = State.get(getIncomingValue(In), Part);
9513       if (In == 0)
9514         Entry[Part] = In0; // Initialize with the first incoming value.
9515       else {
9516         // Select between the current value and the previous incoming edge
9517         // based on the incoming mask.
9518         Value *Cond = State.get(getMask(In), Part);
9519         Entry[Part] =
9520             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9521       }
9522     }
9523   }
9524   for (unsigned Part = 0; Part < State.UF; ++Part)
9525     State.set(this, Entry[Part], Part);
9526 }
9527 
9528 void VPInterleaveRecipe::execute(VPTransformState &State) {
9529   assert(!State.Instance && "Interleave group being replicated.");
9530   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9531                                       getStoredValues(), getMask());
9532 }
9533 
9534 void VPReductionRecipe::execute(VPTransformState &State) {
9535   assert(!State.Instance && "Reduction being replicated.");
9536   Value *PrevInChain = State.get(getChainOp(), 0);
9537   for (unsigned Part = 0; Part < State.UF; ++Part) {
9538     RecurKind Kind = RdxDesc->getRecurrenceKind();
9539     bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9540     Value *NewVecOp = State.get(getVecOp(), Part);
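    // If the reduction is predicated, select between the vector operand and
    // the reduction identity so that masked-off lanes do not contribute.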
9541     if (VPValue *Cond = getCondOp()) {
9542       Value *NewCond = State.get(Cond, Part);
9543       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9544       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9545           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9546       Constant *IdenVec =
9547           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9548       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9549       NewVecOp = Select;
9550     }
9551     Value *NewRed;
9552     Value *NextInChain;
9553     if (IsOrdered) {
9554       if (State.VF.isVector())
9555         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9556                                         PrevInChain);
9557       else
9558         NewRed = State.Builder.CreateBinOp(
9559             (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
9560             PrevInChain, NewVecOp);
9561       PrevInChain = NewRed;
9562     } else {
9563       PrevInChain = State.get(getChainOp(), Part);
9564       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9565     }
9566     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9567       NextInChain =
9568           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9569                          NewRed, PrevInChain);
9570     } else if (IsOrdered)
9571       NextInChain = NewRed;
9572     else {
9573       NextInChain = State.Builder.CreateBinOp(
9574           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9575           PrevInChain);
9576     }
9577     State.set(this, NextInChain, Part);
9578   }
9579 }
9580 
9581 void VPReplicateRecipe::execute(VPTransformState &State) {
9582   if (State.Instance) { // Generate a single instance.
9583     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9584     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9585                                     *State.Instance, IsPredicated, State);
9586     // Insert scalar instance packing it into a vector.
9587     if (AlsoPack && State.VF.isVector()) {
9588       // If we're constructing lane 0, initialize to start from poison.
9589       if (State.Instance->Lane.isFirstLane()) {
9590         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9591         Value *Poison = PoisonValue::get(
9592             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9593         State.set(this, Poison, State.Instance->Part);
9594       }
9595       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9596     }
9597     return;
9598   }
9599 
9600   // Generate scalar instances for all VF lanes of all UF parts, unless the
9601   // instruction is uniform, in which case generate only the first lane for
9602   // each of the UF parts.
9603   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9604   assert((!State.VF.isScalable() || IsUniform) &&
9605          "Can't scalarize a scalable vector");
9606   for (unsigned Part = 0; Part < State.UF; ++Part)
9607     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9608       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9609                                       VPIteration(Part, Lane), IsPredicated,
9610                                       State);
9611 }
9612 
9613 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9614   assert(State.Instance && "Branch on Mask works only on single instance.");
9615 
9616   unsigned Part = State.Instance->Part;
9617   unsigned Lane = State.Instance->Lane.getKnownLane();
9618 
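  // Compute the branch condition for this lane: extract the lane's bit from
  // the block-in mask if there is one, otherwise the block is executed
  // unconditionally.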
9619   Value *ConditionBit = nullptr;
9620   VPValue *BlockInMask = getMask();
9621   if (BlockInMask) {
9622     ConditionBit = State.get(BlockInMask, Part);
9623     if (ConditionBit->getType()->isVectorTy())
9624       ConditionBit = State.Builder.CreateExtractElement(
9625           ConditionBit, State.Builder.getInt32(Lane));
9626   } else // Block in mask is all-one.
9627     ConditionBit = State.Builder.getTrue();
9628 
9629   // Replace the temporary unreachable terminator with a new conditional branch,
9630   // whose two destinations will be set later when they are created.
9631   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9632   assert(isa<UnreachableInst>(CurrentTerminator) &&
9633          "Expected to replace unreachable terminator with conditional branch.");
9634   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9635   CondBr->setSuccessor(0, nullptr);
9636   ReplaceInstWithInst(CurrentTerminator, CondBr);
9637 }
9638 
9639 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9640   assert(State.Instance && "Predicated instruction PHI works per instance.");
9641   Instruction *ScalarPredInst =
9642       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9643   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9644   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9645   assert(PredicatingBB && "Predicated block has no single predecessor.");
9646   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9647          "operand must be VPReplicateRecipe");
9648 
9649   // By current pack/unpack logic we need to generate only a single phi node: if
9650   // a vector value for the predicated instruction exists at this point it means
9651   // the instruction has vector users only, and a phi for the vector value is
9652   // needed. In this case the recipe of the predicated instruction is marked to
9653   // also do that packing, thereby "hoisting" the insert-element sequence.
9654   // Otherwise, a phi node for the scalar value is needed.
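  // For example (illustrative sketch), the packing case below produces IR of
  // the form
  //   pred.bb:
  //     %v.new = insertelement <4 x i32> %v, i32 %scalar, i32 %lane
  //   continue.bb:
  //     %vphi = phi <4 x i32> [ %v, %predicating.bb ], [ %v.new, %pred.bb ]
  // where %v is the vector value before the predicated insert.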
9655   unsigned Part = State.Instance->Part;
9656   if (State.hasVectorValue(getOperand(0), Part)) {
9657     Value *VectorValue = State.get(getOperand(0), Part);
9658     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9659     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9660     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9661     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9662     if (State.hasVectorValue(this, Part))
9663       State.reset(this, VPhi, Part);
9664     else
9665       State.set(this, VPhi, Part);
9666     // NOTE: Currently we need to update the value of the operand, so the next
9667     // predicated iteration inserts its generated value in the correct vector.
9668     State.reset(getOperand(0), VPhi, Part);
9669   } else {
9670     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9671     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9672     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9673                      PredicatingBB);
9674     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9675     if (State.hasScalarValue(this, *State.Instance))
9676       State.reset(this, Phi, *State.Instance);
9677     else
9678       State.set(this, Phi, *State.Instance);
9679     // NOTE: Currently we need to update the value of the operand, so the next
9680     // predicated iteration inserts its generated value in the correct vector.
9681     State.reset(getOperand(0), Phi, *State.Instance);
9682   }
9683 }
9684 
9685 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9686   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9687   State.ILV->vectorizeMemoryInstruction(
9688       &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
9689       StoredValue, getMask());
9690 }
9691 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code size, 2) predication compiler options, 3) loop hints
// forcing predication, and 4) a TTI hook that analyzes whether the loop is
// suitable for predication.
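// For example (illustrative): a function marked optsize/minsize always gets
// CM_ScalarEpilogueNotAllowedOptSize here, regardless of hints, because case
// 1 takes precedence over cases 2-4 below.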
9696 static ScalarEpilogueLowering getScalarEpilogueLowering(
9697     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9698     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9699     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9700     LoopVectorizationLegality &LVL) {
9701   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9702   // don't look at hints or options, and don't request a scalar epilogue.
9703   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9704   // LoopAccessInfo (due to code dependency and not being able to reliably get
9705   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9706   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9707   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9708   // back to the old way and vectorize with versioning when forced. See D81345.)
9709   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9710                                                       PGSOQueryType::IRPass) &&
9711                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9712     return CM_ScalarEpilogueNotAllowedOptSize;
9713 
  // 2) If set, obey the directives.
9715   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9716     switch (PreferPredicateOverEpilogue) {
9717     case PreferPredicateTy::ScalarEpilogue:
9718       return CM_ScalarEpilogueAllowed;
9719     case PreferPredicateTy::PredicateElseScalarEpilogue:
9720       return CM_ScalarEpilogueNotNeededUsePredicate;
9721     case PreferPredicateTy::PredicateOrDontVectorize:
9722       return CM_ScalarEpilogueNotAllowedUsePredicate;
9723     };
9724   }
9725 
  // 3) If set, obey the hints.
9727   switch (Hints.getPredicate()) {
9728   case LoopVectorizeHints::FK_Enabled:
9729     return CM_ScalarEpilogueNotNeededUsePredicate;
9730   case LoopVectorizeHints::FK_Disabled:
9731     return CM_ScalarEpilogueAllowed;
9732   };
9733 
  // 4) If the TTI hook indicates this is profitable, request predication.
9735   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9736                                        LVL.getLAI()))
9737     return CM_ScalarEpilogueNotNeededUsePredicate;
9738 
9739   return CM_ScalarEpilogueAllowed;
9740 }
9741 
9742 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9743   // If Values have been set for this Def return the one relevant for \p Part.
9744   if (hasVectorValue(Def, Part))
9745     return Data.PerPartOutput[Def][Part];
9746 
9747   if (!hasScalarValue(Def, {Part, 0})) {
9748     Value *IRV = Def->getLiveInIRValue();
9749     Value *B = ILV->getBroadcastInstrs(IRV);
9750     set(Def, B, Part);
9751     return B;
9752   }
9753 
9754   Value *ScalarValue = get(Def, {Part, 0});
9755   // If we aren't vectorizing, we can just copy the scalar map values over
9756   // to the vector map.
9757   if (VF.isScalar()) {
9758     set(Def, ScalarValue, Part);
9759     return ScalarValue;
9760   }
9761 
9762   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9763   bool IsUniform = RepR && RepR->isUniform();
9764 
9765   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9766   // Check if there is a scalar value for the selected lane.
9767   if (!hasScalarValue(Def, {Part, LastLane})) {
9768     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9769     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
9770            "unexpected recipe found to be invariant");
9771     IsUniform = true;
9772     LastLane = 0;
9773   }
9774 
9775   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9776   // Set the insert point after the last scalarized instruction or after the
9777   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9778   // will directly follow the scalar definitions.
9779   auto OldIP = Builder.saveIP();
9780   auto NewIP =
9781       isa<PHINode>(LastInst)
9782           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9783           : std::next(BasicBlock::iterator(LastInst));
9784   Builder.SetInsertPoint(&*NewIP);
9785 
9786   // However, if we are vectorizing, we need to construct the vector values.
9787   // If the value is known to be uniform after vectorization, we can just
9788   // broadcast the scalar value corresponding to lane zero for each unroll
9789   // iteration. Otherwise, we construct the vector values using
9790   // insertelement instructions. Since the resulting vectors are stored in
9791   // State, we will only generate the insertelements once.
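  // For example (illustrative sketch): packing VF=4 scalars %s0..%s3 of one
  // unroll part builds a chain of the form
  //   %v0 = insertelement <4 x i32> poison, i32 %s0, i32 0
  //   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
  //   ...
  // and the final insertelement becomes the per-part vector value in State.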
9792   Value *VectorValue = nullptr;
9793   if (IsUniform) {
9794     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9795     set(Def, VectorValue, Part);
9796   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
9801     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9802       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9803     VectorValue = get(Def, Part);
9804   }
9805   Builder.restoreIP(OldIP);
9806   return VectorValue;
9807 }
9808 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
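// For example (illustrative): with the VPlan-native path enabled
// (EnableVPlanNativePath), an annotated outer loop such as
//   for (i = 0; i < N; ++i)   // outer loop requested for vectorization
//     for (j = 0; j < M; ++j)
//       ...
// is handed to this function instead of the inner-loop pipeline.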
9813 static bool processLoopInVPlanNativePath(
9814     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9815     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9816     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9817     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9818     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9819     LoopVectorizationRequirements &Requirements) {
9820 
9821   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9822     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9823     return false;
9824   }
9825   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9826   Function *F = L->getHeader()->getParent();
9827   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9828 
9829   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9830       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9831 
9832   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9833                                 &Hints, IAI);
9834   // Use the planner for outer loop vectorization.
9835   // TODO: CM is not used at this point inside the planner. Turn CM into an
9836   // optional argument if we don't need it in the future.
9837   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
9838                                Requirements, ORE);
9839 
9840   // Get user vectorization factor.
9841   ElementCount UserVF = Hints.getWidth();
9842 
9843   // Plan how to best vectorize, return the best VF and its cost.
9844   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9845 
9846   // If we are stress testing VPlan builds, do not attempt to generate vector
9847   // code. Masked vector code generation support will follow soon.
9848   // Also, do not attempt to vectorize if no vector code will be produced.
9849   if (VPlanBuildStressTest || EnableVPlanPredication ||
9850       VectorizationFactor::Disabled() == VF)
9851     return false;
9852 
9853   LVP.setBestPlan(VF.Width, 1);
9854 
9855   {
9856     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9857                              F->getParent()->getDataLayout());
9858     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9859                            &CM, BFI, PSI, Checks);
9860     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9861                       << L->getHeader()->getParent()->getName() << "\"\n");
9862     LVP.executePlan(LB, DT);
9863   }
9864 
9865   // Mark the loop as already vectorized to avoid vectorizing again.
9866   Hints.setAlreadyVectorized();
9867   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9868   return true;
9869 }
9870 
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with such extensions, there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
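// For example (illustrative C source):
//   float F[N]; double D;
//   F[i] = F[i] + D;  // F[i] is extended to double and truncated back
// leaves an fpext that (transitively) feeds a float store, which the walk
// below reports.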
9875 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9876   SmallVector<Instruction *, 4> Worklist;
9877   for (BasicBlock *BB : L->getBlocks()) {
9878     for (Instruction &Inst : *BB) {
9879       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9880         if (S->getValueOperand()->getType()->isFloatTy())
9881           Worklist.push_back(S);
9882       }
9883     }
9884   }
9885 
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
9888   SmallPtrSet<const Instruction *, 4> Visited;
9889   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9890   while (!Worklist.empty()) {
9891     auto *I = Worklist.pop_back_val();
9892     if (!L->contains(I))
9893       continue;
9894     if (!Visited.insert(I).second)
9895       continue;
9896 
9897     // Emit a remark if the floating point store required a floating
9898     // point conversion.
9899     // TODO: More work could be done to identify the root cause such as a
9900     // constant or a function return type and point the user to it.
9901     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9902       ORE->emit([&]() {
9903         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9904                                           I->getDebugLoc(), L->getHeader())
9905                << "floating point conversion changes vector width. "
9906                << "Mixed floating point precision requires an up/down "
9907                << "cast that will negatively impact performance.";
9908       });
9909 
9910     for (Use &Op : I->operands())
9911       if (auto *OpI = dyn_cast<Instruction>(Op))
9912         Worklist.push_back(OpI);
9913   }
9914 }
9915 
9916 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9917     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9918                                !EnableLoopInterleaving),
9919       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9920                               !EnableLoopVectorization) {}
9921 
9922 bool LoopVectorizePass::processLoop(Loop *L) {
9923   assert((EnableVPlanNativePath || L->isInnermost()) &&
9924          "VPlan-native path is not enabled. Only process inner loops.");
9925 
9926 #ifndef NDEBUG
9927   const std::string DebugLocStr = getDebugLocString(L);
9928 #endif /* NDEBUG */
9929 
9930   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9931                     << L->getHeader()->getParent()->getName() << "\" from "
9932                     << DebugLocStr << "\n");
9933 
9934   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9935 
9936   LLVM_DEBUG(
9937       dbgs() << "LV: Loop hints:"
9938              << " force="
9939              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9940                      ? "disabled"
9941                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9942                             ? "enabled"
9943                             : "?"))
9944              << " width=" << Hints.getWidth()
9945              << " interleave=" << Hints.getInterleave() << "\n");
9946 
  // Function containing the loop.
9948   Function *F = L->getHeader()->getParent();
9949 
9950   // Looking at the diagnostic output is the only way to determine if a loop
9951   // was vectorized (other than looking at the IR or machine code), so it
9952   // is important to generate an optimization remark for each loop. Most of
9953   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9954   // generated as OptimizationRemark and OptimizationRemarkMissed are
9955   // less verbose reporting vectorized loops and unvectorized loops that may
9956   // benefit from vectorization, respectively.
9957 
9958   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9959     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9960     return false;
9961   }
9962 
9963   PredicatedScalarEvolution PSE(*SE, *L);
9964 
9965   // Check if it is legal to vectorize the loop.
9966   LoopVectorizationRequirements Requirements;
9967   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9968                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9969   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9970     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9971     Hints.emitRemarkWithHints();
9972     return false;
9973   }
9974 
9975   // Check the function attributes and profiles to find out if this function
9976   // should be optimized for size.
9977   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9978       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9979 
9980   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9981   // here. They may require CFG and instruction level transformations before
9982   // even evaluating whether vectorization is profitable. Since we cannot modify
9983   // the incoming IR, we need to build VPlan upfront in the vectorization
9984   // pipeline.
9985   if (!L->isInnermost())
9986     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9987                                         ORE, BFI, PSI, Hints, Requirements);
9988 
9989   assert(L->isInnermost() && "Inner loop expected.");
9990 
9991   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9992   // count by optimizing for size, to minimize overheads.
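  // For example (illustrative): a loop with a known trip count of 3 (below
  // TinyTripCountVectorThreshold) is still considered, but SEL is demoted to
  // CM_ScalarEpilogueNotAllowedLowTripLoop unless vectorization is forced.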
9993   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9994   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9995     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9996                       << "This loop is worth vectorizing only if no scalar "
9997                       << "iteration overheads are incurred.");
9998     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9999       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10000     else {
10001       LLVM_DEBUG(dbgs() << "\n");
10002       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10003     }
10004   }
10005 
10006   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
10008   // an integer loop and the vector instructions selected are purely integer
10009   // vector instructions?
10010   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10011     reportVectorizationFailure(
10012         "Can't vectorize when the NoImplicitFloat attribute is used",
10013         "loop not vectorized due to NoImplicitFloat attribute",
10014         "NoImplicitFloat", ORE, L);
10015     Hints.emitRemarkWithHints();
10016     return false;
10017   }
10018 
10019   // Check if the target supports potentially unsafe FP vectorization.
10020   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10021   // for the target we're vectorizing for, to make sure none of the
10022   // additional fp-math flags can help.
10023   if (Hints.isPotentiallyUnsafe() &&
10024       TTI->isFPVectorizationPotentiallyUnsafe()) {
10025     reportVectorizationFailure(
10026         "Potentially unsafe FP op prevents vectorization",
10027         "loop not vectorized due to unsafe FP support.",
10028         "UnsafeFP", ORE, L);
10029     Hints.emitRemarkWithHints();
10030     return false;
10031   }
10032 
10033   if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
10034     ORE->emit([&]() {
10035       auto *ExactFPMathInst = Requirements.getExactFPInst();
10036       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10037                                                  ExactFPMathInst->getDebugLoc(),
10038                                                  ExactFPMathInst->getParent())
10039              << "loop not vectorized: cannot prove it is safe to reorder "
10040                 "floating-point operations";
10041     });
10042     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10043                          "reorder floating-point operations\n");
10044     Hints.emitRemarkWithHints();
10045     return false;
10046   }
10047 
10048   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10049   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10050 
10051   // If an override option has been passed in for interleaved accesses, use it.
10052   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10053     UseInterleaved = EnableInterleavedMemAccesses;
10054 
10055   // Analyze interleaved memory accesses.
10056   if (UseInterleaved) {
10057     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10058   }
10059 
10060   // Use the cost model.
10061   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10062                                 F, &Hints, IAI);
10063   CM.collectValuesToIgnore();
10064 
10065   // Use the planner for vectorization.
10066   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10067                                Requirements, ORE);
10068 
10069   // Get user vectorization factor and interleave count.
10070   ElementCount UserVF = Hints.getWidth();
10071   unsigned UserIC = Hints.getInterleave();
10072 
10073   // Plan how to best vectorize, return the best VF and its cost.
10074   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10075 
10076   VectorizationFactor VF = VectorizationFactor::Disabled();
10077   unsigned IC = 1;
10078 
10079   if (MaybeVF) {
10080     VF = *MaybeVF;
10081     // Select the interleave count.
10082     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10083   }
10084 
10085   // Identify the diagnostic messages that should be produced.
10086   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10087   bool VectorizeLoop = true, InterleaveLoop = true;
10088   if (VF.Width.isScalar()) {
10089     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10090     VecDiagMsg = std::make_pair(
10091         "VectorizationNotBeneficial",
10092         "the cost-model indicates that vectorization is not beneficial");
10093     VectorizeLoop = false;
10094   }
10095 
10096   if (!MaybeVF && UserIC > 1) {
10097     // Tell the user interleaving was avoided up-front, despite being explicitly
10098     // requested.
10099     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10100                          "interleaving should be avoided up front\n");
10101     IntDiagMsg = std::make_pair(
10102         "InterleavingAvoided",
10103         "Ignoring UserIC, because interleaving was avoided up front");
10104     InterleaveLoop = false;
10105   } else if (IC == 1 && UserIC <= 1) {
10106     // Tell the user interleaving is not beneficial.
10107     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10108     IntDiagMsg = std::make_pair(
10109         "InterleavingNotBeneficial",
10110         "the cost-model indicates that interleaving is not beneficial");
10111     InterleaveLoop = false;
10112     if (UserIC == 1) {
10113       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10114       IntDiagMsg.second +=
10115           " and is explicitly disabled or interleave count is set to 1";
10116     }
10117   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
10119     LLVM_DEBUG(
10120         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10121     IntDiagMsg = std::make_pair(
10122         "InterleavingBeneficialButDisabled",
10123         "the cost-model indicates that interleaving is beneficial "
10124         "but is explicitly disabled or interleave count is set to 1");
10125     InterleaveLoop = false;
10126   }
10127 
10128   // Override IC if user provided an interleave count.
10129   IC = UserIC > 0 ? UserIC : IC;
10130 
10131   // Emit diagnostic messages, if any.
10132   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10133   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10135     ORE->emit([&]() {
10136       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10137                                       L->getStartLoc(), L->getHeader())
10138              << VecDiagMsg.second;
10139     });
10140     ORE->emit([&]() {
10141       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10142                                       L->getStartLoc(), L->getHeader())
10143              << IntDiagMsg.second;
10144     });
10145     return false;
10146   } else if (!VectorizeLoop && InterleaveLoop) {
10147     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10148     ORE->emit([&]() {
10149       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10150                                         L->getStartLoc(), L->getHeader())
10151              << VecDiagMsg.second;
10152     });
10153   } else if (VectorizeLoop && !InterleaveLoop) {
10154     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10155                       << ") in " << DebugLocStr << '\n');
10156     ORE->emit([&]() {
10157       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10158                                         L->getStartLoc(), L->getHeader())
10159              << IntDiagMsg.second;
10160     });
10161   } else if (VectorizeLoop && InterleaveLoop) {
10162     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10163                       << ") in " << DebugLocStr << '\n');
10164     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10165   }
10166 
10167   bool DisableRuntimeUnroll = false;
10168   MDNode *OrigLoopID = L->getLoopID();
10169   {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10173     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10174                              F->getParent()->getDataLayout());
10175     if (!VF.Width.isScalar() || IC > 1)
10176       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10177     LVP.setBestPlan(VF.Width, IC);
10178 
10179     using namespace ore;
10180     if (!VectorizeLoop) {
10181       assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that vectorizing the loop is not profitable, then
      // interleave it.
10184       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10185                                  &CM, BFI, PSI, Checks);
10186       LVP.executePlan(Unroller, DT);
10187 
10188       ORE->emit([&]() {
10189         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10190                                   L->getHeader())
10191                << "interleaved loop (interleaved count: "
10192                << NV("InterleaveCount", IC) << ")";
10193       });
10194     } else {
      // If we decided that vectorizing the loop is profitable, then do it.
10196 
10197       // Consider vectorizing the epilogue too if it's profitable.
10198       VectorizationFactor EpilogueVF =
10199           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10200       if (EpilogueVF.Width.isVector()) {
10201 
10202         // The first pass vectorizes the main loop and creates a scalar epilogue
10203         // to be vectorized by executing the plan (potentially with a different
10204         // factor) again shortly afterwards.
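        // For example (illustrative): with VF.Width = 8, IC = 2 and
        // EpilogueVF.Width = 4, EPI records {MainLoopVF=8, MainLoopUF=2,
        // EpilogueVF=4, EpilogueUF=1}; the main loop runs first and the
        // leftover iterations are then vectorized at width 4.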
10205         EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
10206                                           EpilogueVF.Width.getKnownMinValue(),
10207                                           1);
10208         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10209                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10210 
10211         LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
10212         LVP.executePlan(MainILV, DT);
10213         ++LoopsVectorized;
10214 
10215         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10216         formLCSSARecursively(*L, *DT, LI, SE);
10217 
10218         // Second pass vectorizes the epilogue and adjusts the control flow
10219         // edges from the first pass.
10220         LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
10221         EPI.MainLoopVF = EPI.EpilogueVF;
10222         EPI.MainLoopUF = EPI.EpilogueUF;
10223         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10224                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10225                                                  Checks);
10226         LVP.executePlan(EpilogILV, DT);
10227         ++LoopsEpilogueVectorized;
10228 
10229         if (!MainILV.areSafetyChecksAdded())
10230           DisableRuntimeUnroll = true;
10231       } else {
10232         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10233                                &LVL, &CM, BFI, PSI, Checks);
10234         LVP.executePlan(LB, DT);
10235         ++LoopsVectorized;
10236 
        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks for strides and memory. A scalar loop
        // that is rarely executed is not worth unrolling.
10240         if (!LB.areSafetyChecksAdded())
10241           DisableRuntimeUnroll = true;
10242       }
10243       // Report the vectorization decision.
10244       ORE->emit([&]() {
10245         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10246                                   L->getHeader())
10247                << "vectorized loop (vectorization width: "
10248                << NV("VectorizationFactor", VF.Width)
10249                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10250       });
10251     }
10252 
10253     if (ORE->allowExtraAnalysis(LV_NAME))
10254       checkMixedPrecision(L, ORE);
10255   }
10256 
10257   Optional<MDNode *> RemainderLoopID =
10258       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10259                                       LLVMLoopVectorizeFollowupEpilogue});
10260   if (RemainderLoopID.hasValue()) {
10261     L->setLoopID(RemainderLoopID.getValue());
10262   } else {
10263     if (DisableRuntimeUnroll)
10264       AddRuntimeUnrollDisableMetaData(L);
10265 
10266     // Mark the loop as already vectorized to avoid vectorizing again.
10267     Hints.setAlreadyVectorized();
10268   }
10269 
10270   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10271   return true;
10272 }
10273 
10274 LoopVectorizeResult LoopVectorizePass::runImpl(
10275     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10276     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10277     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10278     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10279     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10280   SE = &SE_;
10281   LI = &LI_;
10282   TTI = &TTI_;
10283   DT = &DT_;
10284   BFI = &BFI_;
10285   TLI = TLI_;
10286   AA = &AA_;
10287   AC = &AC_;
10288   GetLAA = &GetLAA_;
10289   DB = &DB_;
10290   ORE = &ORE_;
10291   PSI = PSI_;
10292 
10293   // Don't attempt if
10294   // 1. the target claims to have no vector registers, and
10295   // 2. interleaving won't help ILP.
10296   //
10297   // The second condition is necessary because, even if the target has no
10298   // vector registers, loop vectorization may still enable scalar
10299   // interleaving.
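  // For example (illustrative): a target reporting zero vector registers but
  // a maximum interleave factor of 4 still proceeds, since interleaving
  // scalar iterations can improve ILP even without vector code.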
10300   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10301       TTI->getMaxInterleaveFactor(1) < 2)
10302     return LoopVectorizeResult(false, false);
10303 
10304   bool Changed = false, CFGChanged = false;
10305 
10306   // The vectorizer requires loops to be in simplified form.
10307   // Since simplification may add new inner loops, it has to run before the
10308   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10310   // vectorized.
10311   for (auto &L : *LI)
10312     Changed |= CFGChanged |=
10313         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10314 
10315   // Build up a worklist of inner-loops to vectorize. This is necessary as
10316   // the act of vectorizing or partially unrolling a loop creates new loops
10317   // and can invalidate iterators across the loops.
10318   SmallVector<Loop *, 8> Worklist;
10319 
10320   for (Loop *L : *LI)
10321     collectSupportedLoops(*L, LI, ORE, Worklist);
10322 
10323   LoopsAnalyzed += Worklist.size();
10324 
10325   // Now walk the identified inner loops.
10326   while (!Worklist.empty()) {
10327     Loop *L = Worklist.pop_back_val();
10328 
10329     // For the inner loops we actually process, form LCSSA to simplify the
10330     // transform.
10331     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10332 
10333     Changed |= CFGChanged |= processLoop(L);
10334   }
10335 
10336   // Process each loop nest in the function.
10337   return LoopVectorizeResult(Changed, CFGChanged);
10338 }
10339 
10340 PreservedAnalyses LoopVectorizePass::run(Function &F,
10341                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
10383 }
10384