1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
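// For example (illustrative only), with a vectorization factor of 4 the scalar
// loop
//   for (i = 0; i < n; i++) a[i] = b[i] + 1;
// is rewritten so that each iteration of the vector loop computes a[i..i+3]
// with vector instructions and increments the induction variable by 4.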
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Option prefer-predicate-over-epilogue indicates that an epilogue is
182 // undesired and that predication is preferred; the values below list all
183 // options. I.e., the vectorizer will try to fold the tail loop (epilogue) into
184 // the vector body and predicate the instructions accordingly. If tail-folding
185 // fails, there are different fallback strategies depending on these values:
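// For example (illustrative), with VF = 4 and a trip count of 10, folding the
// tail means executing 3 predicated vector iterations, the last of which has
// only 2 active lanes, instead of 2 vector iterations followed by a
// 2-iteration scalar epilogue loop.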
186 namespace PreferPredicateTy {
187   enum Option {
188     ScalarEpilogue = 0,
189     PredicateElseScalarEpilogue,
190     PredicateOrDontVectorize
191   };
192 } // namespace PreferPredicateTy
193 
194 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
195     "prefer-predicate-over-epilogue",
196     cl::init(PreferPredicateTy::ScalarEpilogue),
197     cl::Hidden,
198     cl::desc("Tail-folding and predication preferences over creating a scalar "
199              "epilogue loop."),
200     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
201                          "scalar-epilogue",
202                          "Don't tail-predicate loops, create scalar epilogue"),
203               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
204                          "predicate-else-scalar-epilogue",
205                          "prefer tail-folding, create scalar epilogue if tail "
206                          "folding fails."),
207               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
208                          "predicate-dont-vectorize",
209                          "prefer tail-folding, don't attempt vectorization if "
210                          "tail-folding fails.")));
211 
212 static cl::opt<bool> MaximizeBandwidth(
213     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
214     cl::desc("Maximize bandwidth when selecting vectorization factor, which "
215              "will be determined by the smallest type in the loop."));
216 
217 static cl::opt<bool> EnableInterleavedMemAccesses(
218     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
219     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
220 
221 /// An interleave-group may need masking if it resides in a block that needs
222 /// predication, or in order to mask away gaps.
223 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
224     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
225     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
226 
227 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
228     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
229     cl::desc("We don't interleave loops with an estimated constant trip count "
230              "below this number"));
231 
232 static cl::opt<unsigned> ForceTargetNumScalarRegs(
233     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
234     cl::desc("A flag that overrides the target's number of scalar registers."));
235 
236 static cl::opt<unsigned> ForceTargetNumVectorRegs(
237     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
238     cl::desc("A flag that overrides the target's number of vector registers."));
239 
240 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
241     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
242     cl::desc("A flag that overrides the target's max interleave factor for "
243              "scalar loops."));
244 
245 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
246     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
247     cl::desc("A flag that overrides the target's max interleave factor for "
248              "vectorized loops."));
249 
250 static cl::opt<unsigned> ForceTargetInstructionCost(
251     "force-target-instruction-cost", cl::init(0), cl::Hidden,
252     cl::desc("A flag that overrides the target's expected cost for "
253              "an instruction to a single constant value. Mostly "
254              "useful for getting consistent testing."));
255 
256 static cl::opt<unsigned> SmallLoopCost(
257     "small-loop-cost", cl::init(20), cl::Hidden,
258     cl::desc(
259         "The cost of a loop that is considered 'small' by the interleaver."));
260 
261 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
262     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
263     cl::desc("Enable the use of the block frequency analysis to access PGO "
264              "heuristics minimizing code growth in cold regions and being more "
265              "aggressive in hot regions."));
266 
267 // Runtime interleave loops for load/store throughput.
268 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
269     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
270     cl::desc(
271         "Enable runtime interleaving until load/store ports are saturated"));
272 
273 /// Interleave small loops with scalar reductions.
274 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
275     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
276     cl::desc("Enable interleaving for loops with small iteration counts that "
277              "contain scalar reductions to expose ILP."));
278 
279 /// The number of stores in a loop that are allowed to need predication.
280 static cl::opt<unsigned> NumberOfStoresToPredicate(
281     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
282     cl::desc("Max number of stores to be predicated behind an if."));
283 
284 static cl::opt<bool> EnableIndVarRegisterHeur(
285     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
286     cl::desc("Count the induction variable only once when interleaving"));
287 
288 static cl::opt<bool> EnableCondStoresVectorization(
289     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
290     cl::desc("Enable if predication of stores during vectorization."));
291 
292 static cl::opt<unsigned> MaxNestedScalarReductionIC(
293     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
294     cl::desc("The maximum interleave count to use when interleaving a scalar "
295              "reduction in a nested loop."));
296 
297 static cl::opt<bool>
298     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
299                            cl::Hidden,
300                            cl::desc("Prefer in-loop vector reductions, "
301                                     "overriding the target's preference."));
302 
303 static cl::opt<bool> PreferPredicatedReductionSelect(
304     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
305     cl::desc(
306         "Prefer predicating a reduction operation over an after loop select."));
307 
308 cl::opt<bool> EnableVPlanNativePath(
309     "enable-vplan-native-path", cl::init(false), cl::Hidden,
310     cl::desc("Enable VPlan-native vectorization path with "
311              "support for outer loop vectorization."));
312 
313 // FIXME: Remove this switch once we have divergence analysis. Currently we
314 // assume divergent non-backedge branches when this switch is true.
315 cl::opt<bool> EnableVPlanPredication(
316     "enable-vplan-predication", cl::init(false), cl::Hidden,
317     cl::desc("Enable VPlan-native vectorization path predicator with "
318              "support for outer loop vectorization."));
319 
320 // This flag enables the stress testing of the VPlan H-CFG construction in the
321 // VPlan-native vectorization path. It must be used in conjunction with
322 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
323 // verification of the H-CFGs built.
324 static cl::opt<bool> VPlanBuildStressTest(
325     "vplan-build-stress-test", cl::init(false), cl::Hidden,
326     cl::desc(
327         "Build VPlan for every supported loop nest in the function and bail "
328         "out right after the build (stress test the VPlan H-CFG construction "
329         "in the VPlan-native vectorization path)."));
330 
331 cl::opt<bool> llvm::EnableLoopInterleaving(
332     "interleave-loops", cl::init(true), cl::Hidden,
333     cl::desc("Enable loop interleaving in Loop vectorization passes"));
334 cl::opt<bool> llvm::EnableLoopVectorization(
335     "vectorize-loops", cl::init(true), cl::Hidden,
336     cl::desc("Run the Loop vectorization passes"));
337 
338 /// A helper function that returns the type of loaded or stored value.
339 static Type *getMemInstValueType(Value *I) {
340   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
341          "Expected Load or Store instruction");
342   if (auto *LI = dyn_cast<LoadInst>(I))
343     return LI->getType();
344   return cast<StoreInst>(I)->getValueOperand()->getType();
345 }
346 
347 /// A helper function that returns true if the given type is irregular. The
348 /// type is irregular if its allocated size doesn't equal the store size of an
349 /// element of the corresponding vector type at the given vectorization factor.
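/// For example, under a typical data layout i1 has an allocation size of 8
/// bits but a type size of 1 bit, so it is treated as irregular here.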
350 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
351   assert(!VF.isScalable() && "scalable vectors not yet supported.");
352   // Determine if an array of VF elements of type Ty is "bitcast compatible"
353   // with a <VF x Ty> vector.
354   if (VF.isVector()) {
355     auto *VectorTy = VectorType::get(Ty, VF);
356     return TypeSize::get(VF.getKnownMinValue() *
357                              DL.getTypeAllocSize(Ty).getFixedValue(),
358                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
359   }
360 
361   // If the vectorization factor is one, we just check if an array of type Ty
362   // requires padding between elements.
363   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
364 }
365 
366 /// A helper function that returns the reciprocal of the block probability of
367 /// predicated blocks. If we return X, we are assuming the predicated block
368 /// will execute once for every X iterations of the loop header.
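/// For example, with the current value of 2, an instruction in a predicated
/// block is assumed to execute on every other iteration of the loop header,
/// so its contribution is scaled down accordingly in the cost model.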
369 ///
370 /// TODO: We should use actual block probability here, if available. Currently,
371 ///       we always assume predicated blocks have a 50% chance of executing.
372 static unsigned getReciprocalPredBlockProb() { return 2; }
373 
374 /// A helper function that adds a 'fast' flag to floating-point operations.
375 static Value *addFastMathFlag(Value *V) {
376   if (isa<FPMathOperator>(V))
377     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
378   return V;
379 }
380 
381 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
382   if (isa<FPMathOperator>(V))
383     cast<Instruction>(V)->setFastMathFlags(FMF);
384   return V;
385 }
386 
387 /// A helper function that returns an integer or floating-point constant with
388 /// value C.
389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391                            : ConstantFP::get(Ty, C);
392 }
393 
394 /// Returns "best known" trip count for the specified loop \p L as defined by
395 /// the following procedure:
396 ///   1) Returns exact trip count if it is known.
397 ///   2) Returns expected trip count according to profile data if any.
398 ///   3) Returns upper bound estimate if it is known.
399 ///   4) Returns None if all of the above failed.
400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401   // Check if exact trip count is known.
402   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403     return ExpectedTC;
404 
405   // Check if there is an expected trip count available from profile data.
406   if (LoopVectorizeWithBlockFrequency)
407     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408       return EstimatedTC;
409 
410   // Check if upper bound estimate is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412     return ExpectedTC;
413 
414   return None;
415 }
416 
417 namespace llvm {
418 
419 /// InnerLoopVectorizer vectorizes loops which contain only one basic
420 /// block to a specified vectorization factor (VF).
421 /// This class performs the widening of scalars into vectors, or multiple
422 /// scalars. This class also implements the following features:
423 /// * It inserts an epilogue loop for handling loops that don't have iteration
424 ///   counts that are known to be a multiple of the vectorization factor.
425 /// * It handles the code generation for reduction variables.
426 /// * Scalarization (implementation using scalars) of un-vectorizable
427 ///   instructions.
428 /// InnerLoopVectorizer does not perform any vectorization-legality
429 /// checks, and relies on the caller to check for the different legality
430 /// aspects. The InnerLoopVectorizer relies on the
431 /// LoopVectorizationLegality class to provide information about the induction
432 /// and reduction variables that were found, for a given vectorization factor.
433 class InnerLoopVectorizer {
434 public:
435   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
436                       LoopInfo *LI, DominatorTree *DT,
437                       const TargetLibraryInfo *TLI,
438                       const TargetTransformInfo *TTI, AssumptionCache *AC,
439                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
440                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
441                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
442                       ProfileSummaryInfo *PSI)
443       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
444         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
445         Builder(PSE.getSE()->getContext()),
446         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
447         BFI(BFI), PSI(PSI) {
448     // Query this against the original loop and save it here because the profile
449     // of the original loop header may change as the transformation happens.
450     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
451         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
452   }
453 
454   virtual ~InnerLoopVectorizer() = default;
455 
456   /// Create a new empty loop that will contain vectorized instructions later
457   /// on, while the old loop will be used as the scalar remainder. Control flow
458   /// is generated around the vectorized (and scalar epilogue) loops consisting
459   /// of various checks and bypasses. Return the pre-header block of the new
460   /// loop.
461   BasicBlock *createVectorizedLoopSkeleton();
462 
463   /// Widen a single instruction within the innermost loop.
464   void widenInstruction(Instruction &I, VPUser &Operands,
465                         VPTransformState &State);
466 
467   /// Widen a single call instruction within the innermost loop.
468   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
469                             VPTransformState &State);
470 
471   /// Widen a single select instruction within the innermost loop.
472   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
473                               bool InvariantCond, VPTransformState &State);
474 
475   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
476   void fixVectorizedLoop();
477 
478   // Return true if any runtime check is added.
479   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
480 
481   /// A type for vectorized values in the new loop. Each value from the
482   /// original loop, when vectorized, is represented by UF vector values in the
483   /// new unrolled loop, where UF is the unroll factor.
484   using VectorParts = SmallVector<Value *, 2>;
485 
486   /// Vectorize a single GetElementPtrInst based on information gathered and
487   /// decisions taken during planning.
488   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
489                 ElementCount VF, bool IsPtrLoopInvariant,
490                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
491 
492   /// Vectorize a single PHINode in a block. This method handles the induction
493   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
494   /// arbitrary length vectors.
495   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
496 
497   /// A helper function to scalarize a single Instruction in the innermost loop.
498   /// Generates a sequence of scalar instances for each lane between \p MinLane
499   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
500   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
501   /// Instr's operands.
502   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
503                             const VPIteration &Instance, bool IfPredicateInstr,
504                             VPTransformState &State);
505 
506   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
507   /// is provided, the integer induction variable will first be truncated to
508   /// the corresponding type.
509   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
510 
511   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
512   /// vector or scalar value on-demand if one is not yet available. When
513   /// vectorizing a loop, we visit the definition of an instruction before its
514   /// uses. When visiting the definition, we either vectorize or scalarize the
515   /// instruction, creating an entry for it in the corresponding map. (In some
516   /// cases, such as induction variables, we will create both vector and scalar
517   /// entries.) Then, as we encounter uses of the definition, we derive values
518   /// for each scalar or vector use unless such a value is already available.
519   /// For example, if we scalarize a definition and one of its uses is vector,
520   /// we build the required vector on-demand with an insertelement sequence
521   /// when visiting the use. Otherwise, if the use is scalar, we can use the
522   /// existing scalar definition.
523   ///
524   /// Return a value in the new loop corresponding to \p V from the original
525   /// loop at unroll index \p Part. If the value has already been vectorized,
526   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
527   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
528   /// a new vector value on-demand by inserting the scalar values into a vector
529   /// with an insertelement sequence. If the value has been neither vectorized
530   /// nor scalarized, it must be loop invariant, so we simply broadcast the
531   /// value into a vector.
532   Value *getOrCreateVectorValue(Value *V, unsigned Part);
533 
534   /// Return a value in the new loop corresponding to \p V from the original
535   /// loop at unroll and vector indices \p Instance. If the value has been
536   /// vectorized but not scalarized, the necessary extractelement instruction
537   /// will be generated.
538   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
539 
540   /// Construct the vector value of a scalarized value \p V one lane at a time.
541   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
542 
543   /// Try to vectorize interleaved access group \p Group with the base address
544   /// given in \p Addr, optionally masking the vector operations if \p
545   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
546   /// values in the vectorized loop.
547   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
548                                 VPTransformState &State, VPValue *Addr,
549                                 VPValue *BlockInMask = nullptr);
550 
551   /// Vectorize Load and Store instructions with the base address given in \p
552   /// Addr, optionally masking the vector operations if \p BlockInMask is
553   /// non-null. Use \p State to translate given VPValues to IR values in the
554   /// vectorized loop.
555   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
556                                   VPValue *Addr, VPValue *StoredValue,
557                                   VPValue *BlockInMask);
558 
559   /// Set the debug location in the builder using the debug location in
560   /// the instruction.
561   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
562 
563   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
564   void fixNonInductionPHIs();
565 
566 protected:
567   friend class LoopVectorizationPlanner;
568 
569   /// A small list of PHINodes.
570   using PhiVector = SmallVector<PHINode *, 4>;
571 
572   /// A type for scalarized values in the new loop. Each value from the
573   /// original loop, when scalarized, is represented by UF x VF scalar values
574   /// in the new unrolled loop, where UF is the unroll factor and VF is the
575   /// vectorization factor.
576   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
577 
578   /// Set up the values of the IVs correctly when exiting the vector loop.
579   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
580                     Value *CountRoundDown, Value *EndValue,
581                     BasicBlock *MiddleBlock);
582 
583   /// Create a new induction variable inside L.
584   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
585                                    Value *Step, Instruction *DL);
586 
587   /// Handle all cross-iteration phis in the header.
588   void fixCrossIterationPHIs();
589 
590   /// Fix a first-order recurrence. This is the second phase of vectorizing
591   /// this phi node.
592   void fixFirstOrderRecurrence(PHINode *Phi);
593 
594   /// Fix a reduction cross-iteration phi. This is the second phase of
595   /// vectorizing this phi node.
596   void fixReduction(PHINode *Phi);
597 
598   /// Clear NSW/NUW flags from reduction instructions if necessary.
599   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
600 
601   /// The Loop exit block may have single value PHI nodes with some
602   /// The loop exit block may have single-value PHI nodes with some
603   /// incoming value. While vectorizing, we only handled real values
604   /// that were defined inside the loop, and we should have one value for
605   /// each predecessor of its parent basic block. See PR14725.
606 
607   /// Iteratively sink the scalarized operands of a predicated instruction into
608   /// the block that was created for it.
609   void sinkScalarOperands(Instruction *PredInst);
610 
611   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
612   /// represented as.
613   void truncateToMinimalBitwidths();
614 
615   /// Create a broadcast instruction. This method generates a broadcast
616   /// instruction (shuffle) for loop invariant values and for the induction
617   /// value. If this is the induction variable then we extend it to N, N+1, ...
618   /// this is needed because each iteration in the loop corresponds to a SIMD
619   /// element.
620   virtual Value *getBroadcastInstrs(Value *V);
621 
622   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
623   /// to each vector element of Val. The sequence starts at StartIndex.
624   /// \p Opcode is relevant for FP induction variables.
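  /// For example, with StartIdx = 0 and Step = 1, a 4-element Val holding the
  /// broadcast induction value %iv yields <%iv, %iv+1, %iv+2, %iv+3>.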
625   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
626                                Instruction::BinaryOps Opcode =
627                                Instruction::BinaryOpsEnd);
628 
629   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
630   /// variable on which to base the steps, \p Step is the size of the step, and
631   /// \p EntryVal is the value from the original loop that maps to the steps.
632   /// Note that \p EntryVal doesn't have to be an induction variable - it
633   /// can also be a truncate instruction.
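  /// For example (illustrative), with VF = 4, UF = 1 and Step = 2, the scalar
  /// steps generated for the four lanes are ScalarIV + 0, ScalarIV + 2,
  /// ScalarIV + 4 and ScalarIV + 6.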
634   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
635                         const InductionDescriptor &ID);
636 
637   /// Create a vector induction phi node based on an existing scalar one. \p
638   /// EntryVal is the value from the original loop that maps to the vector phi
639   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
640   /// truncate instruction, instead of widening the original IV, we widen a
641   /// version of the IV truncated to \p EntryVal's type.
642   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
643                                        Value *Step, Instruction *EntryVal);
644 
645   /// Returns true if an instruction \p I should be scalarized instead of
646   /// vectorized for the chosen vectorization factor.
647   bool shouldScalarizeInstruction(Instruction *I) const;
648 
649   /// Returns true if we should generate a scalar version of \p IV.
650   bool needsScalarInduction(Instruction *IV) const;
651 
652   /// If there is a cast involved in the induction variable \p ID, which should
653   /// be ignored in the vectorized loop body, this function records the
654   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
655   /// cast. We had already proved that the casted Phi is equal to the uncasted
656   /// Phi in the vectorized loop (under a runtime guard), and therefore
657   /// there is no need to vectorize the cast - the same value can be used in the
658   /// vector loop for both the Phi and the cast.
659   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
660   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
661   ///
662   /// \p EntryVal is the value from the original loop that maps to the vector
663   /// phi node and is used to distinguish what is the IV currently being
664   /// processed - original one (if \p EntryVal is a phi corresponding to the
665   /// original IV) or the "newly-created" one based on the proof mentioned above
666   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
667   /// latter case \p EntryVal is a TruncInst and we must not record anything for
668   /// that IV, but it's error-prone to expect callers of this routine to care
669   /// about that, hence this explicit parameter.
670   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
671                                              const Instruction *EntryVal,
672                                              Value *VectorLoopValue,
673                                              unsigned Part,
674                                              unsigned Lane = UINT_MAX);
675 
676   /// Generate a shuffle sequence that will reverse the vector Vec.
677   virtual Value *reverseVector(Value *Vec);
678 
679   /// Returns (and creates if needed) the original loop trip count.
680   Value *getOrCreateTripCount(Loop *NewLoop);
681 
682   /// Returns (and creates if needed) the trip count of the widened loop.
683   Value *getOrCreateVectorTripCount(Loop *NewLoop);
684 
685   /// Returns a bitcasted value to the requested vector type.
686   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
687   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
688                                 const DataLayout &DL);
689 
690   /// Emit a bypass check to see if the vector trip count is zero, including if
691   /// it overflows.
692   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
693 
694   /// Emit a bypass check to see if all of the SCEV assumptions we've
695   /// had to make are correct.
696   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
697 
698   /// Emit bypass checks to check any memory assumptions we may have made.
699   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
700 
701   /// Compute the transformed value of Index at offset StartValue using step
702   /// StepValue.
703   /// For integer induction, returns StartValue + Index * StepValue.
704   /// For pointer induction, returns StartValue[Index * StepValue].
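  /// For example, an integer induction with StartValue = 10 and StepValue = 3
  /// maps Index = 4 to 10 + 4 * 3 = 22; a pointer induction maps it to the
  /// address of StartValue[4 * 3].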
705   /// FIXME: The newly created binary instructions should contain nsw/nuw
706   /// flags, which can be found from the original scalar operations.
707   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
708                               const DataLayout &DL,
709                               const InductionDescriptor &ID) const;
710 
711   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
712   /// vector loop preheader, middle block and scalar preheader. Also
713   /// allocate a loop object for the new vector loop and return it.
714   Loop *createVectorLoopSkeleton(StringRef Prefix);
715 
716   /// Create new phi nodes for the induction variables to resume iteration count
717   /// in the scalar epilogue, from where the vectorized loop left off (given by
718   /// \p VectorTripCount).
719   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
720 
721   /// Complete the loop skeleton by adding debug MDs, creating appropriate
722   /// conditional branches in the middle block, preparing the builder and
723   /// running the verifier. Take in the vector loop \p L as argument, and return
724   /// the preheader of the completed vector loop.
725   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
726 
727   /// Add additional metadata to \p To that was not present on \p Orig.
728   ///
729   /// Currently this is used to add the noalias annotations based on the
730   /// inserted memchecks.  Use this for instructions that are *cloned* into the
731   /// vector loop.
732   void addNewMetadata(Instruction *To, const Instruction *Orig);
733 
734   /// Add metadata from one instruction to another.
735   ///
736   /// This includes both the original MDs from \p From and additional ones (\see
737   /// addNewMetadata).  Use this for *newly created* instructions in the vector
738   /// loop.
739   void addMetadata(Instruction *To, Instruction *From);
740 
741   /// Similar to the previous function but it adds the metadata to a
742   /// vector of instructions.
743   void addMetadata(ArrayRef<Value *> To, Instruction *From);
744 
745   /// The original loop.
746   Loop *OrigLoop;
747 
748   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
749   /// dynamic knowledge to simplify SCEV expressions and converts them to a
750   /// more usable form.
751   PredicatedScalarEvolution &PSE;
752 
753   /// Loop Info.
754   LoopInfo *LI;
755 
756   /// Dominator Tree.
757   DominatorTree *DT;
758 
759   /// Alias Analysis.
760   AAResults *AA;
761 
762   /// Target Library Info.
763   const TargetLibraryInfo *TLI;
764 
765   /// Target Transform Info.
766   const TargetTransformInfo *TTI;
767 
768   /// Assumption Cache.
769   AssumptionCache *AC;
770 
771   /// Interface to emit optimization remarks.
772   OptimizationRemarkEmitter *ORE;
773 
774   /// LoopVersioning.  It's only set up (non-null) if memchecks were
775   /// used.
776   ///
777   /// This is currently only used to add no-alias metadata based on the
778   /// memchecks.  The actual versioning is performed manually.
779   std::unique_ptr<LoopVersioning> LVer;
780 
781   /// The vectorization SIMD factor to use. Each vector will have this many
782   /// vector elements.
783   ElementCount VF;
784 
785   /// The vectorization unroll factor to use. Each scalar is vectorized to this
786   /// many different vector instructions.
787   unsigned UF;
788 
789   /// The builder that we use
790   IRBuilder<> Builder;
791 
792   // --- Vectorization state ---
793 
794   /// The vector-loop preheader.
795   BasicBlock *LoopVectorPreHeader;
796 
797   /// The scalar-loop preheader.
798   BasicBlock *LoopScalarPreHeader;
799 
800   /// Middle Block between the vector and the scalar.
801   BasicBlock *LoopMiddleBlock;
802 
803   /// The ExitBlock of the scalar loop.
804   BasicBlock *LoopExitBlock;
805 
806   /// The vector loop body.
807   BasicBlock *LoopVectorBody;
808 
809   /// The scalar loop body.
810   BasicBlock *LoopScalarBody;
811 
812   /// A list of all bypass blocks. The first block is the entry of the loop.
813   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
814 
815   /// The new Induction variable which was added to the new block.
816   PHINode *Induction = nullptr;
817 
818   /// The induction variable of the old basic block.
819   PHINode *OldInduction = nullptr;
820 
821   /// Maps values from the original loop to their corresponding values in the
822   /// vectorized loop. A key value can map to either vector values, scalar
823   /// values or both kinds of values, depending on whether the key was
824   /// vectorized and scalarized.
825   VectorizerValueMap VectorLoopValueMap;
826 
827   /// Store instructions that were predicated.
828   SmallVector<Instruction *, 4> PredicatedInstructions;
829 
830   /// Trip count of the original loop.
831   Value *TripCount = nullptr;
832 
833   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
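  /// For example, with TripCount = 100, VF = 4 and UF = 2 this is
  /// 100 - (100 % 8) = 96, the largest multiple of VF * UF that does not
  /// exceed the original trip count.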
834   Value *VectorTripCount = nullptr;
835 
836   /// The legality analysis.
837   LoopVectorizationLegality *Legal;
838 
839   /// The profitability analysis.
840   LoopVectorizationCostModel *Cost;
841 
842   // Record whether runtime checks are added.
843   bool AddedSafetyChecks = false;
844 
845   // Holds the end values for each induction variable. We save the end values
846   // so we can later fix up the external users of the induction variables.
847   DenseMap<PHINode *, Value *> IVEndValues;
848 
849   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
850   // fixed up at the end of vector code generation.
851   SmallVector<PHINode *, 8> OrigPHIsToFix;
852 
853   /// BFI and PSI are used to check for profile guided size optimizations.
854   BlockFrequencyInfo *BFI;
855   ProfileSummaryInfo *PSI;
856 
857   // Whether this loop should be optimized for size based on profile guided size
858   // optimizations.
859   bool OptForSizeBasedOnProfile;
860 };
861 
862 class InnerLoopUnroller : public InnerLoopVectorizer {
863 public:
864   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
865                     LoopInfo *LI, DominatorTree *DT,
866                     const TargetLibraryInfo *TLI,
867                     const TargetTransformInfo *TTI, AssumptionCache *AC,
868                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
869                     LoopVectorizationLegality *LVL,
870                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
871                     ProfileSummaryInfo *PSI)
872       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
873                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
874                             BFI, PSI) {}
875 
876 private:
877   Value *getBroadcastInstrs(Value *V) override;
878   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
879                        Instruction::BinaryOps Opcode =
880                        Instruction::BinaryOpsEnd) override;
881   Value *reverseVector(Value *Vec) override;
882 };
883 
884 } // end namespace llvm
885 
886 /// Look for a meaningful debug location on the instruction or its
887 /// operands.
888 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
889   if (!I)
890     return I;
891 
892   DebugLoc Empty;
893   if (I->getDebugLoc() != Empty)
894     return I;
895 
896   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
897     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
898       if (OpInst->getDebugLoc() != Empty)
899         return OpInst;
900   }
901 
902   return I;
903 }
904 
905 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
906   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
907     const DILocation *DIL = Inst->getDebugLoc();
908     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
909         !isa<DbgInfoIntrinsic>(Inst)) {
910       assert(!VF.isScalable() && "scalable vectors not yet supported.");
911       auto NewDIL =
912           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
913       if (NewDIL)
914         B.SetCurrentDebugLocation(NewDIL.getValue());
915       else
916         LLVM_DEBUG(dbgs()
917                    << "Failed to create new discriminator: "
918                    << DIL->getFilename() << " Line: " << DIL->getLine());
919     }
920     else
921       B.SetCurrentDebugLocation(DIL);
922   } else
923     B.SetCurrentDebugLocation(DebugLoc());
924 }
925 
926 /// Write a record \p DebugMsg about vectorization failure to the debug
927 /// output stream. If \p I is passed, it is an instruction that prevents
928 /// vectorization.
929 #ifndef NDEBUG
930 static void debugVectorizationFailure(const StringRef DebugMsg,
931     Instruction *I) {
932   dbgs() << "LV: Not vectorizing: " << DebugMsg;
933   if (I != nullptr)
934     dbgs() << " " << *I;
935   else
936     dbgs() << '.';
937   dbgs() << '\n';
938 }
939 #endif
940 
941 /// Create an analysis remark that explains why vectorization failed
942 ///
943 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
944 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
945 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
946 /// the location of the remark.  \return the remark object that can be
947 /// streamed to.
948 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
949     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
950   Value *CodeRegion = TheLoop->getHeader();
951   DebugLoc DL = TheLoop->getStartLoc();
952 
953   if (I) {
954     CodeRegion = I->getParent();
955     // If there is no debug location attached to the instruction, fall back to
956     // using the loop's location.
957     if (I->getDebugLoc())
958       DL = I->getDebugLoc();
959   }
960 
961   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
962   R << "loop not vectorized: ";
963   return R;
964 }
965 
966 namespace llvm {
967 
968 void reportVectorizationFailure(const StringRef DebugMsg,
969     const StringRef OREMsg, const StringRef ORETag,
970     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
971   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
972   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
973   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
974                 ORETag, TheLoop, I) << OREMsg);
975 }
976 
977 } // end namespace llvm
978 
979 #ifndef NDEBUG
980 /// \return string containing a file name and a line # for the given loop.
981 static std::string getDebugLocString(const Loop *L) {
982   std::string Result;
983   if (L) {
984     raw_string_ostream OS(Result);
985     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
986       LoopDbgLoc.print(OS);
987     else
988       // Just print the module name.
989       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
990     OS.flush();
991   }
992   return Result;
993 }
994 #endif
995 
996 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
997                                          const Instruction *Orig) {
998   // If the loop was versioned with memchecks, add the corresponding no-alias
999   // metadata.
1000   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1001     LVer->annotateInstWithNoAlias(To, Orig);
1002 }
1003 
1004 void InnerLoopVectorizer::addMetadata(Instruction *To,
1005                                       Instruction *From) {
1006   propagateMetadata(To, From);
1007   addNewMetadata(To, From);
1008 }
1009 
1010 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1011                                       Instruction *From) {
1012   for (Value *V : To) {
1013     if (Instruction *I = dyn_cast<Instruction>(V))
1014       addMetadata(I, From);
1015   }
1016 }
1017 
1018 namespace llvm {
1019 
1020 // The possible ways in which the scalar epilogue loop should be lowered, used
1021 // by the loop vectorization cost model.
1022 enum ScalarEpilogueLowering {
1023 
1024   // The default: allowing scalar epilogues.
1025   CM_ScalarEpilogueAllowed,
1026 
1027   // Vectorization with OptForSize: don't allow epilogues.
1028   CM_ScalarEpilogueNotAllowedOptSize,
1029 
1030   // A special case of vectorization with OptForSize: loops with a very small
1031   // trip count are considered for vectorization under OptForSize, thereby
1032   // making sure the cost of their loop body is dominant, free of runtime
1033   // guards and scalar iteration overheads.
1034   CM_ScalarEpilogueNotAllowedLowTripLoop,
1035 
1036   // Loop hint predicate indicating an epilogue is undesired.
1037   CM_ScalarEpilogueNotNeededUsePredicate
1038 };
1039 
1040 /// LoopVectorizationCostModel - estimates the expected speedups due to
1041 /// vectorization.
1042 /// In many cases vectorization is not profitable. This can happen because of
1043 /// a number of reasons. In this class we mainly attempt to predict the
1044 /// expected speedup/slowdowns due to the supported instruction set. We use the
1045 /// TargetTransformInfo to query the different backends for the cost of
1046 /// different operations.
1047 class LoopVectorizationCostModel {
1048 public:
1049   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1050                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1051                              LoopVectorizationLegality *Legal,
1052                              const TargetTransformInfo &TTI,
1053                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1054                              AssumptionCache *AC,
1055                              OptimizationRemarkEmitter *ORE, const Function *F,
1056                              const LoopVectorizeHints *Hints,
1057                              InterleavedAccessInfo &IAI)
1058       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1059         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1060         Hints(Hints), InterleaveInfo(IAI) {}
1061 
1062   /// \return An upper bound for the vectorization factor, or None if
1063   /// vectorization and interleaving should be avoided up front.
1064   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1065 
1066   /// \return True if runtime checks are required for vectorization, and false
1067   /// otherwise.
1068   bool runtimeChecksRequired();
1069 
1070   /// \return The most profitable vectorization factor and the cost of that VF.
1071   /// This method checks every power of two up to MaxVF. If UserVF is not zero,
1072   /// then this vectorization factor will be selected if vectorization is
1073   /// possible.
1074   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1075 
1076   /// Setup cost-based decisions for user vectorization factor.
1077   void selectUserVectorizationFactor(ElementCount UserVF) {
1078     collectUniformsAndScalars(UserVF);
1079     collectInstsToScalarize(UserVF);
1080   }
1081 
1082   /// \return The size (in bits) of the smallest and widest types in the code
1083   /// that needs to be vectorized. We ignore values that remain scalar such as
1084   /// 64 bit loop indices.
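  /// For example, a loop that loads i8 values and accumulates them into an
  /// i32 sum would report the pair (8, 32).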
1085   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1086 
1087   /// \return The desired interleave count.
1088   /// If interleave count has been specified by metadata it will be returned.
1089   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1090   /// are the selected vectorization factor and the cost of the selected VF.
1091   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1092 
1093   /// A memory access instruction may be vectorized in more than one way.
1094   /// The form of the instruction after vectorization depends on its cost.
1095   /// This function makes cost-based decisions for Load/Store instructions
1096   /// and collects them in a map. This decision map is used for building
1097   /// the lists of loop-uniform and loop-scalar instructions.
1098   /// The calculated cost is saved with the widening decision in order to
1099   /// avoid redundant calculations.
1100   void setCostBasedWideningDecision(ElementCount VF);
1101 
1102   /// A struct that represents some properties of the register usage
1103   /// of a loop.
1104   struct RegisterUsage {
1105     /// Holds the number of loop invariant values that are used in the loop.
1106     /// The key is ClassID of target-provided register class.
1107     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1108     /// Holds the maximum number of concurrent live intervals in the loop.
1109     /// The key is ClassID of target-provided register class.
1110     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1111   };
1112 
1113   /// \return Returns information about the register usages of the loop for the
1114   /// given vectorization factors.
1115   SmallVector<RegisterUsage, 8>
1116   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1117 
1118   /// Collect values we want to ignore in the cost model.
1119   void collectValuesToIgnore();
1120 
1121   /// Split reductions into those that happen in the loop, and those that happen
1122   /// outside. In-loop reductions are collected into InLoopReductionChains.
1123   void collectInLoopReductions();
1124 
1125   /// \returns The smallest bitwidth each instruction can be represented with.
1126   /// The vector equivalents of these instructions should be truncated to this
1127   /// type.
1128   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1129     return MinBWs;
1130   }
1131 
1132   /// \returns True if it is more profitable to scalarize instruction \p I for
1133   /// vectorization factor \p VF.
1134   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1135     assert(VF.isVector() &&
1136            "Profitable to scalarize relevant only for VF > 1.");
1137 
1138     // Cost model is not run in the VPlan-native path - return conservative
1139     // result until this changes.
1140     if (EnableVPlanNativePath)
1141       return false;
1142 
1143     auto Scalars = InstsToScalarize.find(VF);
1144     assert(Scalars != InstsToScalarize.end() &&
1145            "VF not yet analyzed for scalarization profitability");
1146     return Scalars->second.find(I) != Scalars->second.end();
1147   }
1148 
1149   /// Returns true if \p I is known to be uniform after vectorization.
1150   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1151     if (VF.isScalar())
1152       return true;
1153 
1154     // Cost model is not run in the VPlan-native path - return conservative
1155     // result until this changes.
1156     if (EnableVPlanNativePath)
1157       return false;
1158 
1159     auto UniformsPerVF = Uniforms.find(VF);
1160     assert(UniformsPerVF != Uniforms.end() &&
1161            "VF not yet analyzed for uniformity");
1162     return UniformsPerVF->second.count(I);
1163   }
1164 
1165   /// Returns true if \p I is known to be scalar after vectorization.
1166   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1167     if (VF.isScalar())
1168       return true;
1169 
1170     // Cost model is not run in the VPlan-native path - return conservative
1171     // result until this changes.
1172     if (EnableVPlanNativePath)
1173       return false;
1174 
1175     auto ScalarsPerVF = Scalars.find(VF);
1176     assert(ScalarsPerVF != Scalars.end() &&
1177            "Scalar values are not calculated for VF");
1178     return ScalarsPerVF->second.count(I);
1179   }
1180 
1181   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1182   /// for vectorization factor \p VF.
1183   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1184     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1185            !isProfitableToScalarize(I, VF) &&
1186            !isScalarAfterVectorization(I, VF);
1187   }
1188 
1189   /// Decision that was taken during cost calculation for a memory instruction.
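  /// For example (illustrative): a unit-stride access such as a[i] is
  /// typically CM_Widen, a[n - i] is CM_Widen_Reverse, members of an
  /// interleaved group get CM_Interleave, and an indexed access such as
  /// a[b[i]] becomes CM_GatherScatter or CM_Scalarize depending on what the
  /// target supports.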
1190   enum InstWidening {
1191     CM_Unknown,
1192     CM_Widen,         // For consecutive accesses with stride +1.
1193     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1194     CM_Interleave,
1195     CM_GatherScatter,
1196     CM_Scalarize
1197   };
1198 
1199   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1200   /// instruction \p I and vector width \p VF.
1201   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1202                            unsigned Cost) {
1203     assert(VF.isVector() && "Expected VF >=2");
1204     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1205   }
1206 
1207   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1208   /// interleaving group \p Grp and vector width \p VF.
1209   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1210                            ElementCount VF, InstWidening W, unsigned Cost) {
1211     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group. But the
    // cost will be assigned to one instruction only.
1214     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1215       if (auto *I = Grp->getMember(i)) {
1216         if (Grp->getInsertPos() == I)
1217           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1218         else
1219           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1220       }
1221     }
1222   }
1223 
1224   /// Return the cost model decision for the given instruction \p I and vector
1225   /// width \p VF. Return CM_Unknown if this instruction did not pass
1226   /// through the cost modeling.
1227   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1228     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1229     assert(VF.isVector() && "Expected VF >=2");
1230 
1231     // Cost model is not run in the VPlan-native path - return conservative
1232     // result until this changes.
1233     if (EnableVPlanNativePath)
1234       return CM_GatherScatter;
1235 
1236     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1237     auto Itr = WideningDecisions.find(InstOnVF);
1238     if (Itr == WideningDecisions.end())
1239       return CM_Unknown;
1240     return Itr->second.first;
1241   }
1242 
1243   /// Return the vectorization cost for the given instruction \p I and vector
1244   /// width \p VF.
1245   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1246     assert(VF.isVector() && "Expected VF >=2");
1247     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1248     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1249            "The cost is not calculated");
1250     return WideningDecisions[InstOnVF].second;
1251   }
1252 
1253   /// Return True if instruction \p I is an optimizable truncate whose operand
1254   /// is an induction variable. Such a truncate will be removed by adding a new
1255   /// induction variable with the destination type.
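  /// For illustration, in a loop like:
  ///   for (i64 i = 0; i < n; ++i)
  ///     A[i] = (i32)i;
  /// the truncate of the i64 induction to i32 can be removed by creating an
  /// additional i32 induction variable with the truncated step.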
1256   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1257     // If the instruction is not a truncate, return false.
1258     auto *Trunc = dyn_cast<TruncInst>(I);
1259     if (!Trunc)
1260       return false;
1261 
1262     // Get the source and destination types of the truncate.
1263     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1264     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1265 
1266     // If the truncate is free for the given types, return false. Replacing a
1267     // free truncate with an induction variable would add an induction variable
1268     // update instruction to each iteration of the loop. We exclude from this
1269     // check the primary induction variable since it will need an update
1270     // instruction regardless.
1271     Value *Op = Trunc->getOperand(0);
1272     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1273       return false;
1274 
1275     // If the truncated value is not an induction variable, return false.
1276     return Legal->isInductionPhi(Op);
1277   }
1278 
1279   /// Collects the instructions to scalarize for each predicated instruction in
1280   /// the loop.
1281   void collectInstsToScalarize(ElementCount VF);
1282 
1283   /// Collect Uniform and Scalar values for the given \p VF.
1284   /// The sets depend on CM decision for Load/Store instructions
1285   /// that may be vectorized as interleave, gather-scatter or scalarized.
1286   void collectUniformsAndScalars(ElementCount VF) {
1287     // Do the analysis once.
1288     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1289       return;
1290     setCostBasedWideningDecision(VF);
1291     collectLoopUniforms(VF);
1292     collectLoopScalars(VF);
1293   }
1294 
1295   /// Returns true if the target machine supports masked store operation
1296   /// for the given \p DataType and kind of access to \p Ptr.
1297   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1298     return Legal->isConsecutivePtr(Ptr) &&
1299            TTI.isLegalMaskedStore(DataType, Alignment);
1300   }
1301 
1302   /// Returns true if the target machine supports masked load operation
1303   /// for the given \p DataType and kind of access to \p Ptr.
1304   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1305     return Legal->isConsecutivePtr(Ptr) &&
1306            TTI.isLegalMaskedLoad(DataType, Alignment);
1307   }
1308 
1309   /// Returns true if the target machine supports masked scatter operation
1310   /// for the given \p DataType.
1311   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1312     return TTI.isLegalMaskedScatter(DataType, Alignment);
1313   }
1314 
1315   /// Returns true if the target machine supports masked gather operation
1316   /// for the given \p DataType.
1317   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1318     return TTI.isLegalMaskedGather(DataType, Alignment);
1319   }
1320 
1321   /// Returns true if the target machine can represent \p V as a masked gather
1322   /// or scatter operation.
1323   bool isLegalGatherOrScatter(Value *V) {
1324     bool LI = isa<LoadInst>(V);
1325     bool SI = isa<StoreInst>(V);
1326     if (!LI && !SI)
1327       return false;
1328     auto *Ty = getMemInstValueType(V);
1329     Align Align = getLoadStoreAlignment(V);
1330     return (LI && isLegalMaskedGather(Ty, Align)) ||
1331            (SI && isLegalMaskedScatter(Ty, Align));
1332   }
1333 
1334   /// Returns true if \p I is an instruction that will be scalarized with
1335   /// predication. Such instructions include conditional stores and
1336   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
1339   bool isScalarWithPredication(Instruction *I,
1340                                ElementCount VF = ElementCount::getFixed(1));
1341 
1342   // Returns true if \p I is an instruction that will be predicated either
1343   // through scalar predication or masked load/store or masked gather/scatter.
1344   // Superset of instructions that return true for isScalarWithPredication.
1345   bool isPredicatedInst(Instruction *I) {
1346     if (!blockNeedsPredication(I->getParent()))
1347       return false;
1348     // Loads and stores that need some form of masked operation are predicated
1349     // instructions.
1350     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1351       return Legal->isMaskRequired(I);
1352     return isScalarWithPredication(I);
1353   }
1354 
1355   /// Returns true if \p I is a memory instruction with consecutive memory
1356   /// access that can be widened.
1357   bool
1358   memoryInstructionCanBeWidened(Instruction *I,
1359                                 ElementCount VF = ElementCount::getFixed(1));
1360 
1361   /// Returns true if \p I is a memory instruction in an interleaved-group
1362   /// of memory accesses that can be vectorized with wide vector loads/stores
1363   /// and shuffles.
1364   bool
1365   interleavedAccessCanBeWidened(Instruction *I,
1366                                 ElementCount VF = ElementCount::getFixed(1));
1367 
1368   /// Check if \p Instr belongs to any interleaved access group.
1369   bool isAccessInterleaved(Instruction *Instr) {
1370     return InterleaveInfo.isInterleaved(Instr);
1371   }
1372 
1373   /// Get the interleaved access group that \p Instr belongs to.
1374   const InterleaveGroup<Instruction> *
1375   getInterleavedAccessGroup(Instruction *Instr) {
1376     return InterleaveInfo.getInterleaveGroup(Instr);
1377   }
1378 
1379   /// Returns true if an interleaved group requires a scalar iteration
1380   /// to handle accesses with gaps, and there is nothing preventing us from
1381   /// creating a scalar epilogue.
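  /// For example, a load group accessing only A[3*i] and A[3*i+1] (factor 3
  /// with a gap at member 2) is vectorized with wide loads that also cover the
  /// A[3*i+2] slots; a final scalar iteration keeps the last wide load from
  /// reading past the end of A.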
1382   bool requiresScalarEpilogue() const {
1383     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1384   }
1385 
1386   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1387   /// loop hint annotation.
1388   bool isScalarEpilogueAllowed() const {
1389     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1390   }
1391 
1392   /// Returns true if all loop blocks should be masked to fold tail loop.
1393   bool foldTailByMasking() const { return FoldTailByMasking; }
1394 
1395   bool blockNeedsPredication(BasicBlock *BB) {
1396     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1397   }
1398 
1399   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1400   /// nodes to the chain of instructions representing the reductions. Uses a
1401   /// MapVector to ensure deterministic iteration order.
1402   using ReductionChainMap =
1403       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1404 
1405   /// Return the chain of instructions representing an inloop reduction.
1406   const ReductionChainMap &getInLoopReductionChains() const {
1407     return InLoopReductionChains;
1408   }
1409 
1410   /// Returns true if the Phi is part of an inloop reduction.
1411   bool isInLoopReduction(PHINode *Phi) const {
1412     return InLoopReductionChains.count(Phi);
1413   }
1414 
1415   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1416   /// with factor VF.  Return the cost of the instruction, including
1417   /// scalarization overhead if it's needed.
1418   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1419 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
1425   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1426                              bool &NeedToScalarize);
1427 
1428   /// Invalidates decisions already taken by the cost model.
1429   void invalidateCostModelingDecisions() {
1430     WideningDecisions.clear();
1431     Uniforms.clear();
1432     Scalars.clear();
1433   }
1434 
1435 private:
1436   unsigned NumPredStores = 0;
1437 
1438   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1439   /// than zero. One is returned if vectorization should best be avoided due
1440   /// to cost.
1441   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1442 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1450   using VectorizationCostTy = std::pair<unsigned, bool>;
1451 
1452   /// Returns the expected execution cost. The unit of the cost does
1453   /// not matter because we use the 'cost' units to compare different
1454   /// vector widths. The cost that is returned is *not* normalized by
1455   /// the factor width.
1456   VectorizationCostTy expectedCost(ElementCount VF);
1457 
1458   /// Returns the execution time cost of an instruction for a given vector
1459   /// width. Vector width of one means scalar.
1460   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1461 
1462   /// The cost-computation logic from getInstructionCost which provides
1463   /// the vector type as an output parameter.
1464   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1465 
1466   /// Calculate vectorization cost of memory instruction \p I.
1467   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1468 
1469   /// The cost computation for scalarized memory instruction.
1470   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1471 
1472   /// The cost computation for interleaving group of memory instructions.
1473   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1474 
1475   /// The cost computation for Gather/Scatter instruction.
1476   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1477 
1478   /// The cost computation for widening instruction \p I with consecutive
1479   /// memory access.
1480   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1481 
1482   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1483   /// Load: scalar load + broadcast.
1484   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1485   /// element)
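  /// E.g. a load of *P with loop-invariant P becomes one scalar load plus a
  /// broadcast, while a store of a varying value to *P only needs the last
  /// lane's value extracted and stored once per vector iteration.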
1486   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1487 
1488   /// Estimate the overhead of scalarizing an instruction. This is a
1489   /// convenience wrapper for the type-based getScalarizationOverhead API.
1490   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1491 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1494   bool isConsecutiveLoadOrStore(Instruction *I);
1495 
1496   /// Returns true if an artificially high cost for emulated masked memrefs
1497   /// should be used.
1498   bool useEmulatedMaskMemRefHack(Instruction *I);
1499 
1500   /// Map of scalar integer values to the smallest bitwidth they can be legally
1501   /// represented as. The vector equivalents of these values should be truncated
1502   /// to this type.
1503   MapVector<Instruction *, uint64_t> MinBWs;
1504 
1505   /// A type representing the costs for instructions if they were to be
1506   /// scalarized rather than vectorized. The entries are Instruction-Cost
1507   /// pairs.
1508   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1509 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1512   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1513 
1514   /// Records whether it is allowed to have the original scalar loop execute at
1515   /// least once. This may be needed as a fallback loop in case runtime
1516   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
1518   /// or as a peel-loop to handle gaps in interleave-groups.
1519   /// Under optsize and when the trip count is very small we don't allow any
1520   /// iterations to execute in the scalar loop.
1521   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1522 
1523   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1524   bool FoldTailByMasking = false;
1525 
1526   /// A map holding scalar costs for different vectorization factors. The
1527   /// presence of a cost for an instruction in the mapping indicates that the
1528   /// instruction will be scalarized when vectorizing with the associated
1529   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1530   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1531 
1532   /// Holds the instructions known to be uniform after vectorization.
1533   /// The data is collected per VF.
1534   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1535 
1536   /// Holds the instructions known to be scalar after vectorization.
1537   /// The data is collected per VF.
1538   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1539 
1540   /// Holds the instructions (address computations) that are forced to be
1541   /// scalarized.
1542   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1543 
1544   /// PHINodes of the reductions that should be expanded in-loop along with
1545   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1547   ReductionChainMap InLoopReductionChains;
1548 
1549   /// Returns the expected difference in cost from scalarizing the expression
1550   /// feeding a predicated instruction \p PredInst. The instructions to
1551   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1552   /// non-negative return value implies the expression will be scalarized.
1553   /// Currently, only single-use chains are considered for scalarization.
1554   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1555                               ElementCount VF);
1556 
1557   /// Collect the instructions that are uniform after vectorization. An
1558   /// instruction is uniform if we represent it with a single scalar value in
1559   /// the vectorized loop corresponding to each vector iteration. Examples of
1560   /// uniform instructions include pointer operands of consecutive or
1561   /// interleaved memory accesses. Note that although uniformity implies an
1562   /// instruction will be scalar, the reverse is not true. In general, a
1563   /// scalarized instruction will be represented by VF scalar values in the
1564   /// vectorized loop, each corresponding to an iteration of the original
1565   /// scalar loop.
1566   void collectLoopUniforms(ElementCount VF);
1567 
1568   /// Collect the instructions that are scalar after vectorization. An
1569   /// instruction is scalar if it is known to be uniform or will be scalarized
1570   /// during vectorization. Non-uniform scalarized instructions will be
1571   /// represented by VF values in the vectorized loop, each corresponding to an
1572   /// iteration of the original scalar loop.
1573   void collectLoopScalars(ElementCount VF);
1574 
1575   /// Keeps cost model vectorization decision and cost for instructions.
1576   /// Right now it is used for memory instructions only.
1577   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1578                                 std::pair<InstWidening, unsigned>>;
1579 
1580   DecisionList WideningDecisions;
1581 
1582   /// Returns true if \p V is expected to be vectorized and it needs to be
1583   /// extracted.
1584   bool needsExtract(Value *V, ElementCount VF) const {
1585     Instruction *I = dyn_cast<Instruction>(V);
1586     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1587         TheLoop->isLoopInvariant(I))
1588       return false;
1589 
1590     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1592     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1593     // the scalars are collected. That should be a safe assumption in most
1594     // cases, because we check if the operands have vectorizable types
1595     // beforehand in LoopVectorizationLegality.
1596     return Scalars.find(VF) == Scalars.end() ||
1597            !isScalarAfterVectorization(I, VF);
1598   };
1599 
1600   /// Returns a range containing only operands needing to be extracted.
1601   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1602                                                    ElementCount VF) {
1603     return SmallVector<Value *, 4>(make_filter_range(
1604         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1605   }
1606 
1607 public:
1608   /// The loop that we evaluate.
1609   Loop *TheLoop;
1610 
1611   /// Predicated scalar evolution analysis.
1612   PredicatedScalarEvolution &PSE;
1613 
1614   /// Loop Info analysis.
1615   LoopInfo *LI;
1616 
1617   /// Vectorization legality.
1618   LoopVectorizationLegality *Legal;
1619 
1620   /// Vector target information.
1621   const TargetTransformInfo &TTI;
1622 
1623   /// Target Library Info.
1624   const TargetLibraryInfo *TLI;
1625 
1626   /// Demanded bits analysis.
1627   DemandedBits *DB;
1628 
1629   /// Assumption cache.
1630   AssumptionCache *AC;
1631 
1632   /// Interface to emit optimization remarks.
1633   OptimizationRemarkEmitter *ORE;
1634 
1635   const Function *TheFunction;
1636 
1637   /// Loop Vectorize Hint.
1638   const LoopVectorizeHints *Hints;
1639 
1640   /// The interleave access information contains groups of interleaved accesses
1641   /// with the same stride and close to each other.
1642   InterleavedAccessInfo &InterleaveInfo;
1643 
1644   /// Values to ignore in the cost model.
1645   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1646 
1647   /// Values to ignore in the cost model when VF > 1.
1648   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1649 };
1650 
1651 } // end namespace llvm
1652 
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1660 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1661 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1662 // provides *explicit vectorization hints* (LV can bypass legal checks and
1663 // assume that vectorization is legal). However, both hints are implemented
1664 // using the same metadata (llvm.loop.vectorize, processed by
1665 // LoopVectorizeHints). This will be fixed in the future when the native IR
1666 // representation for pragma 'omp simd' is introduced.
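// For illustration, an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       ...
// is treated as explicitly vectorized here (width 4, no interleave hint).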
1667 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1668                                    OptimizationRemarkEmitter *ORE) {
1669   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1670   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1671 
1672   // Only outer loops with an explicit vectorization hint are supported.
1673   // Unannotated outer loops are ignored.
1674   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1675     return false;
1676 
1677   Function *Fn = OuterLp->getHeader()->getParent();
1678   if (!Hints.allowVectorization(Fn, OuterLp,
1679                                 true /*VectorizeOnlyWhenForced*/)) {
1680     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1681     return false;
1682   }
1683 
1684   if (Hints.getInterleave() > 1) {
1685     // TODO: Interleave support is future work.
1686     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1687                          "outer loops.\n");
1688     Hints.emitRemarkWithHints();
1689     return false;
1690   }
1691 
1692   return true;
1693 }
1694 
1695 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1696                                   OptimizationRemarkEmitter *ORE,
1697                                   SmallVectorImpl<Loop *> &V) {
1698   // Collect inner loops and outer loops without irreducible control flow. For
1699   // now, only collect outer loops that have explicit vectorization hints. If we
1700   // are stress testing the VPlan H-CFG construction, we collect the outermost
1701   // loop of every loop nest.
1702   if (L.isInnermost() || VPlanBuildStressTest ||
1703       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1704     LoopBlocksRPO RPOT(&L);
1705     RPOT.perform(LI);
1706     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1707       V.push_back(&L);
1708       // TODO: Collect inner loops inside marked outer loops in case
1709       // vectorization fails for the outer loop. Do not invoke
1710       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1711       // already known to be reducible. We can use an inherited attribute for
1712       // that.
1713       return;
1714     }
1715   }
1716   for (Loop *InnerL : L)
1717     collectSupportedLoops(*InnerL, LI, ORE, V);
1718 }
1719 
1720 namespace {
1721 
1722 /// The LoopVectorize Pass.
1723 struct LoopVectorize : public FunctionPass {
1724   /// Pass identification, replacement for typeid
1725   static char ID;
1726 
1727   LoopVectorizePass Impl;
1728 
1729   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1730                          bool VectorizeOnlyWhenForced = false)
1731       : FunctionPass(ID),
1732         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1733     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1734   }
1735 
1736   bool runOnFunction(Function &F) override {
1737     if (skipFunction(F))
1738       return false;
1739 
1740     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1741     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1742     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1743     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1744     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1745     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1746     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1747     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1748     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1749     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1750     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1751     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1752     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1753 
1754     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1755         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1756 
1757     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1758                         GetLAA, *ORE, PSI).MadeAnyChange;
1759   }
1760 
1761   void getAnalysisUsage(AnalysisUsage &AU) const override {
1762     AU.addRequired<AssumptionCacheTracker>();
1763     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1764     AU.addRequired<DominatorTreeWrapperPass>();
1765     AU.addRequired<LoopInfoWrapperPass>();
1766     AU.addRequired<ScalarEvolutionWrapperPass>();
1767     AU.addRequired<TargetTransformInfoWrapperPass>();
1768     AU.addRequired<AAResultsWrapperPass>();
1769     AU.addRequired<LoopAccessLegacyAnalysis>();
1770     AU.addRequired<DemandedBitsWrapperPass>();
1771     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1772     AU.addRequired<InjectTLIMappingsLegacy>();
1773 
1774     // We currently do not preserve loopinfo/dominator analyses with outer loop
1775     // vectorization. Until this is addressed, mark these analyses as preserved
1776     // only for non-VPlan-native path.
1777     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1778     if (!EnableVPlanNativePath) {
1779       AU.addPreserved<LoopInfoWrapperPass>();
1780       AU.addPreserved<DominatorTreeWrapperPass>();
1781     }
1782 
1783     AU.addPreserved<BasicAAWrapperPass>();
1784     AU.addPreserved<GlobalsAAWrapperPass>();
1785     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1786   }
1787 };
1788 
1789 } // end anonymous namespace
1790 
1791 //===----------------------------------------------------------------------===//
1792 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1793 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1794 //===----------------------------------------------------------------------===//
1795 
1796 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will stay
  // inside the vector loop body.
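  // For illustration, broadcasting a scalar x with VF = 4 yields the splat
  // <x, x, x, x>, typically emitted as an insertelement into lane 0 followed
  // by a zero-mask shufflevector.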
1800   Instruction *Instr = dyn_cast<Instruction>(V);
1801   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1802                      (!Instr ||
1803                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1804   // Place the code for broadcasting invariant variables in the new preheader.
1805   IRBuilder<>::InsertPointGuard Guard(Builder);
1806   if (SafeToHoist)
1807     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1808 
1809   // Broadcast the scalar into all locations in the vector.
1810   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1811 
1812   return Shuf;
1813 }
1814 
1815 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1816     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1817   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1818          "Expected either an induction phi-node or a truncate of it!");
1819   Value *Start = II.getStartValue();
1820 
1821   // Construct the initial value of the vector IV in the vector loop preheader
1822   auto CurrIP = Builder.saveIP();
1823   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1824   if (isa<TruncInst>(EntryVal)) {
1825     assert(Start->getType()->isIntegerTy() &&
1826            "Truncation requires an integer type");
1827     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1828     Step = Builder.CreateTrunc(Step, TruncType);
1829     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1830   }
1831   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1832   Value *SteppedStart =
1833       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1834 
1835   // We create vector phi nodes for both integer and floating-point induction
1836   // variables. Here, we determine the kind of arithmetic we will perform.
1837   Instruction::BinaryOps AddOp;
1838   Instruction::BinaryOps MulOp;
1839   if (Step->getType()->isIntegerTy()) {
1840     AddOp = Instruction::Add;
1841     MulOp = Instruction::Mul;
1842   } else {
1843     AddOp = II.getInductionOpcode();
1844     MulOp = Instruction::FMul;
1845   }
1846 
1847   // Multiply the vectorization factor by the step using integer or
1848   // floating-point arithmetic as appropriate.
1849   Value *ConstVF =
1850       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1851   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1852 
1853   // Create a vector splat to use in the induction update.
1854   //
1855   // FIXME: If the step is non-constant, we create the vector splat with
1856   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1857   //        handle a constant vector splat.
1858   assert(!VF.isScalable() && "scalable vectors not yet supported.");
1859   Value *SplatVF = isa<Constant>(Mul)
1860                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1861                        : Builder.CreateVectorSplat(VF, Mul);
1862   Builder.restoreIP(CurrIP);
1863 
1864   // We may need to add the step a number of times, depending on the unroll
1865   // factor. The last of those goes into the PHI.
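  // For illustration, with VF = 4, UF = 2 and a unit step, the parts produced
  // below are <i, i+1, i+2, i+3> and <i+4, ..., i+7>, and the value fed back
  // through the PHI for the next vector iteration is <i+8, ..., i+11>.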
1866   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1867                                     &*LoopVectorBody->getFirstInsertionPt());
1868   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1869   Instruction *LastInduction = VecInd;
1870   for (unsigned Part = 0; Part < UF; ++Part) {
1871     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1872 
1873     if (isa<TruncInst>(EntryVal))
1874       addMetadata(LastInduction, EntryVal);
1875     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1876 
1877     LastInduction = cast<Instruction>(addFastMathFlag(
1878         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1879     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1880   }
1881 
1882   // Move the last step to the end of the latch block. This ensures consistent
1883   // placement of all induction updates.
1884   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1885   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1886   auto *ICmp = cast<Instruction>(Br->getCondition());
1887   LastInduction->moveBefore(ICmp);
1888   LastInduction->setName("vec.ind.next");
1889 
1890   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1891   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1892 }
1893 
1894 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1895   return Cost->isScalarAfterVectorization(I, VF) ||
1896          Cost->isProfitableToScalarize(I, VF);
1897 }
1898 
1899 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1900   if (shouldScalarizeInstruction(IV))
1901     return true;
1902   auto isScalarInst = [&](User *U) -> bool {
1903     auto *I = cast<Instruction>(U);
1904     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1905   };
1906   return llvm::any_of(IV->users(), isScalarInst);
1907 }
1908 
1909 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1910     const InductionDescriptor &ID, const Instruction *EntryVal,
1911     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1912   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1913          "Expected either an induction phi-node or a truncate of it!");
1914 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor as the original IV, but we don't have
  // to do any recording in this case - that is done when the original IV is
  // processed.
1921   if (isa<TruncInst>(EntryVal))
1922     return;
1923 
1924   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1925   if (Casts.empty())
1926     return;
1927   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1929   // induction update chain itself.
1930   Instruction *CastInst = *Casts.begin();
1931   if (Lane < UINT_MAX)
1932     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1933   else
1934     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1935 }
1936 
1937 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1938   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1939          "Primary induction variable must have an integer type");
1940 
1941   auto II = Legal->getInductionVars().find(IV);
1942   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1943 
1944   auto ID = II->second;
1945   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1946 
1947   // The value from the original loop to which we are mapping the new induction
1948   // variable.
1949   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1950 
1951   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1952 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1955   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1956     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1957            "Induction step should be loop invariant");
1958     if (PSE.getSE()->isSCEVable(IV->getType())) {
1959       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1960       return Exp.expandCodeFor(Step, Step->getType(),
1961                                LoopVectorPreHeader->getTerminator());
1962     }
1963     return cast<SCEVUnknown>(Step)->getValue();
1964   };
1965 
1966   // The scalar value to broadcast. This is derived from the canonical
1967   // induction variable. If a truncation type is given, truncate the canonical
1968   // induction variable and step. Otherwise, derive these values from the
1969   // induction descriptor.
1970   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1971     Value *ScalarIV = Induction;
1972     if (IV != OldInduction) {
1973       ScalarIV = IV->getType()->isIntegerTy()
1974                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1975                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1976                                           IV->getType());
1977       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1978       ScalarIV->setName("offset.idx");
1979     }
1980     if (Trunc) {
1981       auto *TruncType = cast<IntegerType>(Trunc->getType());
1982       assert(Step->getType()->isIntegerTy() &&
1983              "Truncation requires an integer step");
1984       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1985       Step = Builder.CreateTrunc(Step, TruncType);
1986     }
1987     return ScalarIV;
1988   };
1989 
1990   // Create the vector values from the scalar IV, in the absence of creating a
1991   // vector IV.
1992   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1993     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1994     for (unsigned Part = 0; Part < UF; ++Part) {
1995       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1996       Value *EntryPart =
1997           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
1998                         ID.getInductionOpcode());
1999       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2000       if (Trunc)
2001         addMetadata(EntryPart, Trunc);
2002       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2003     }
2004   };
2005 
2006   // Now do the actual transformations, and start with creating the step value.
2007   Value *Step = CreateStepValue(ID.getStep());
2008   if (VF.isZero() || VF.isScalar()) {
2009     Value *ScalarIV = CreateScalarIV(Step);
2010     CreateSplatIV(ScalarIV, Step);
2011     return;
2012   }
2013 
2014   // Determine if we want a scalar version of the induction variable. This is
2015   // true if the induction variable itself is not widened, or if it has at
2016   // least one user in the loop that is not widened.
2017   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2018   if (!NeedsScalarIV) {
2019     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2020     return;
2021   }
2022 
2023   // Try to create a new independent vector induction variable. If we can't
2024   // create the phi node, we will splat the scalar induction variable in each
2025   // loop iteration.
2026   if (!shouldScalarizeInstruction(EntryVal)) {
2027     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2028     Value *ScalarIV = CreateScalarIV(Step);
2029     // Create scalar steps that can be used by instructions we will later
2030     // scalarize. Note that the addition of the scalar steps will not increase
2031     // the number of instructions in the loop in the common case prior to
2032     // InstCombine. We will be trading one vector extract for each scalar step.
2033     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2034     return;
2035   }
2036 
2037   // All IV users are scalar instructions, so only emit a scalar IV, not a
2038   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2039   // predicate used by the masked loads/stores.
2040   Value *ScalarIV = CreateScalarIV(Step);
2041   if (!Cost->isScalarEpilogueAllowed())
2042     CreateSplatIV(ScalarIV, Step);
2043   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2044 }
2045 
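// For illustration, getStepVector(<x, x, x, x>, 2, s, BinOp) produces
// <x + 2*s, x + 3*s, x + 4*s, x + 5*s>, using BinOp (FAdd/FSub) for
// floating-point inductions and plain add/mul for integer ones.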
2046 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2047                                           Instruction::BinaryOps BinOp) {
2048   // Create and check the types.
2049   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2050   int VLen = ValVTy->getNumElements();
2051 
2052   Type *STy = Val->getType()->getScalarType();
2053   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2054          "Induction Step must be an integer or FP");
2055   assert(Step->getType() == STy && "Step has wrong type");
2056 
2057   SmallVector<Constant *, 8> Indices;
2058 
2059   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
2061     for (int i = 0; i < VLen; ++i)
2062       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2063 
2064     // Add the consecutive indices to the vector value.
2065     Constant *Cv = ConstantVector::get(Indices);
2066     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2067     Step = Builder.CreateVectorSplat(VLen, Step);
2068     assert(Step->getType() == Val->getType() && "Invalid step vec");
2069     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2070     // which can be found from the original scalar operations.
2071     Step = Builder.CreateMul(Cv, Step);
2072     return Builder.CreateAdd(Val, Step, "induction");
2073   }
2074 
2075   // Floating point induction.
2076   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2077          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2079   for (int i = 0; i < VLen; ++i)
2080     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2081 
2082   // Add the consecutive indices to the vector value.
2083   Constant *Cv = ConstantVector::get(Indices);
2084 
2085   Step = Builder.CreateVectorSplat(VLen, Step);
2086 
2087   // Floating point operations had to be 'fast' to enable the induction.
2088   FastMathFlags Flags;
2089   Flags.setFast();
2090 
2091   Value *MulOp = Builder.CreateFMul(Cv, Step);
2092   if (isa<Instruction>(MulOp))
2093     // Have to check, MulOp may be a constant
2094     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2095 
2096   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2097   if (isa<Instruction>(BOp))
2098     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2099   return BOp;
2100 }
2101 
2102 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2103                                            Instruction *EntryVal,
2104                                            const InductionDescriptor &ID) {
2105   // We shouldn't have to build scalar steps if we aren't vectorizing.
2106   assert(VF.isVector() && "VF should be greater than one");
2107   assert(!VF.isScalable() &&
2108          "the code below assumes a fixed number of elements at compile time");
2109   // Get the value type and ensure it and the step have the same integer type.
2110   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2111   assert(ScalarIVTy == Step->getType() &&
2112          "Val and Step should have the same type");
2113 
2114   // We build scalar steps for both integer and floating-point induction
2115   // variables. Here, we determine the kind of arithmetic we will perform.
2116   Instruction::BinaryOps AddOp;
2117   Instruction::BinaryOps MulOp;
2118   if (ScalarIVTy->isIntegerTy()) {
2119     AddOp = Instruction::Add;
2120     MulOp = Instruction::Mul;
2121   } else {
2122     AddOp = ID.getInductionOpcode();
2123     MulOp = Instruction::FMul;
2124   }
2125 
2126   // Determine the number of scalars we need to generate for each unroll
2127   // iteration. If EntryVal is uniform, we only need to generate the first
2128   // lane. Otherwise, we generate all VF values.
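  // E.g. with VF = 4 and UF = 2, a non-uniform EntryVal gets the scalar steps
  // ScalarIV + {0, 1, 2, 3} * Step for part 0 and ScalarIV + {4, 5, 6, 7} *
  // Step for part 1, whereas a uniform EntryVal only needs lane 0 of each
  // part.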
2129   unsigned Lanes =
2130       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2131           ? 1
2132           : VF.getKnownMinValue();
2133   // Compute the scalar steps and save the results in VectorLoopValueMap.
2134   for (unsigned Part = 0; Part < UF; ++Part) {
2135     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2136       auto *StartIdx = getSignedIntOrFpConstant(
2137           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2138       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2139       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2140       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2141       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2142     }
2143   }
2144 }
2145 
2146 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2147   assert(V != Induction && "The new induction variable should not be used.");
2148   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2149   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2150 
2151   // If we have a stride that is replaced by one, do it here. Defer this for
2152   // the VPlan-native path until we start running Legal checks in that path.
2153   if (!EnableVPlanNativePath && Legal->hasStride(V))
2154     V = ConstantInt::get(V->getType(), 1);
2155 
2156   // If we have a vector mapped to this value, return it.
2157   if (VectorLoopValueMap.hasVectorValue(V, Part))
2158     return VectorLoopValueMap.getVectorValue(V, Part);
2159 
2160   // If the value has not been vectorized, check if it has been scalarized
2161   // instead. If it has been scalarized, and we actually need the value in
2162   // vector form, we will construct the vector values on demand.
2163   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2164     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2165 
2166     // If we've scalarized a value, that value should be an instruction.
2167     auto *I = cast<Instruction>(V);
2168 
2169     // If we aren't vectorizing, we can just copy the scalar map values over to
2170     // the vector map.
2171     if (VF.isScalar()) {
2172       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2173       return ScalarValue;
2174     }
2175 
2176     // Get the last scalar instruction we generated for V and Part. If the value
2177     // is known to be uniform after vectorization, this corresponds to lane zero
2178     // of the Part unroll iteration. Otherwise, the last instruction is the one
2179     // we created for the last vector lane of the Part unroll iteration.
2180     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2181     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2182                             ? 0
2183                             : VF.getKnownMinValue() - 1;
2184     auto *LastInst = cast<Instruction>(
2185         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2186 
2187     // Set the insert point after the last scalarized instruction. This ensures
2188     // the insertelement sequence will directly follow the scalar definitions.
2189     auto OldIP = Builder.saveIP();
2190     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2191     Builder.SetInsertPoint(&*NewIP);
2192 
2193     // However, if we are vectorizing, we need to construct the vector values.
2194     // If the value is known to be uniform after vectorization, we can just
2195     // broadcast the scalar value corresponding to lane zero for each unroll
2196     // iteration. Otherwise, we construct the vector values using insertelement
2197     // instructions. Since the resulting vectors are stored in
2198     // VectorLoopValueMap, we will only generate the insertelements once.
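    // E.g. a scalarized value with lanes s0..s3 (VF = 4) is packed by a chain
    // of insertelements starting from undef: <s0, u, u, u>, <s0, s1, u, u>,
    // and so on until the full <s0, s1, s2, s3> vector is formed.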
2199     Value *VectorValue = nullptr;
2200     if (Cost->isUniformAfterVectorization(I, VF)) {
2201       VectorValue = getBroadcastInstrs(ScalarValue);
2202       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2203     } else {
2204       // Initialize packing with insertelements to start from undef.
2205       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2206       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2207       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2208       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2209         packScalarIntoVectorValue(V, {Part, Lane});
2210       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2211     }
2212     Builder.restoreIP(OldIP);
2213     return VectorValue;
2214   }
2215 
2216   // If this scalar is unknown, assume that it is a constant or that it is
2217   // loop invariant. Broadcast V and save the value for future uses.
2218   Value *B = getBroadcastInstrs(V);
2219   VectorLoopValueMap.setVectorValue(V, Part, B);
2220   return B;
2221 }
2222 
2223 Value *
2224 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2225                                             const VPIteration &Instance) {
2226   // If the value is not an instruction contained in the loop, it should
2227   // already be scalar.
2228   if (OrigLoop->isLoopInvariant(V))
2229     return V;
2230 
2231   assert(Instance.Lane > 0
2232              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2233              : true && "Uniform values only have lane zero");
2234 
2235   // If the value from the original loop has not been vectorized, it is
2236   // represented by UF x VF scalar values in the new loop. Return the requested
2237   // scalar value.
2238   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2239     return VectorLoopValueMap.getScalarValue(V, Instance);
2240 
2241   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2242   // for the given unroll part. If this entry is not a vector type (i.e., the
2243   // vectorization factor is one), there is no need to generate an
2244   // extractelement instruction.
2245   auto *U = getOrCreateVectorValue(V, Instance.Part);
2246   if (!U->getType()->isVectorTy()) {
2247     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2248     return U;
2249   }
2250 
2251   // Otherwise, the value from the original loop has been vectorized and is
2252   // represented by UF vector values. Extract and return the requested scalar
2253   // value from the appropriate vector lane.
2254   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2255 }
2256 
2257 void InnerLoopVectorizer::packScalarIntoVectorValue(
2258     Value *V, const VPIteration &Instance) {
2259   assert(V != Induction && "The new induction variable should not be used.");
2260   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2261   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2262 
2263   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2264   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2265   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2266                                             Builder.getInt32(Instance.Lane));
2267   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2268 }
2269 
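// For illustration, with VF = 4 reverseVector turns <a, b, c, d> into
// <d, c, b, a> via a shufflevector with mask <3, 2, 1, 0>.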
2270 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2271   assert(Vec->getType()->isVectorTy() && "Invalid type");
2272   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2273   SmallVector<int, 8> ShuffleMask;
2274   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2275     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2276 
2277   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2278 }
2279 
2280 // Return whether we allow using masked interleave-groups (for dealing with
2281 // strided loads/stores that reside in predicated blocks, or for dealing
2282 // with gaps).
2283 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2284   // If an override option has been passed in for interleaved accesses, use it.
2285   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2286     return EnableMaskedInterleavedMemAccesses;
2287 
2288   return TTI.enableMaskedInterleavedAccessVectorization();
2289 }
2290 
2291 // Try to vectorize the interleave group that \p Instr belongs to.
2292 //
2293 // E.g. Translate following interleaved load group (factor = 3):
2294 //   for (i = 0; i < N; i+=3) {
2295 //     R = Pic[i];             // Member of index 0
2296 //     G = Pic[i+1];           // Member of index 1
2297 //     B = Pic[i+2];           // Member of index 2
2298 //     ... // do something to R, G, B
2299 //   }
2300 // To:
2301 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2302 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2303 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2304 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2305 //
2306 // Or translate following interleaved store group (factor = 3):
2307 //   for (i = 0; i < N; i+=3) {
2308 //     ... do something to R, G, B
2309 //     Pic[i]   = R;           // Member of index 0
2310 //     Pic[i+1] = G;           // Member of index 1
2311 //     Pic[i+2] = B;           // Member of index 2
2312 //   }
2313 // To:
2314 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2315 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2316 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2317 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2318 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2319 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2320     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2321     VPValue *Addr, VPValue *BlockInMask) {
2322   Instruction *Instr = Group->getInsertPos();
2323   const DataLayout &DL = Instr->getModule()->getDataLayout();
2324 
2325   // Prepare for the vector type of the interleaved load/store.
2326   Type *ScalarTy = getMemInstValueType(Instr);
2327   unsigned InterleaveFactor = Group->getFactor();
2328   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2329   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2330 
2331   // Prepare for the new pointers.
2332   SmallVector<Value *, 2> AddrParts;
2333   unsigned Index = Group->getIndex(Instr);
2334 
2335   // TODO: extend the masked interleaved-group support to reversed access.
2336   assert((!BlockInMask || !Group->isReverse()) &&
2337          "Reversed masked interleave-group not supported.");
2338 
2339   // If the group is reverse, adjust the index to refer to the last vector lane
2340   // instead of the first. We adjust the index from the first vector lane,
2341   // rather than directly getting the pointer for lane VF - 1, because the
2342   // pointer operand of the interleaved access is supposed to be uniform. For
2343   // uniform instructions, we're only required to generate a value for the
2344   // first vector lane in each unroll iteration.
2345   assert(!VF.isScalable() &&
2346          "scalable vector reverse operation is not implemented");
2347   if (Group->isReverse())
2348     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2349 
2350   for (unsigned Part = 0; Part < UF; Part++) {
2351     Value *AddrPart = State.get(Addr, {Part, 0});
2352     setDebugLocFromInst(Builder, AddrPart);
2353 
    // Note that the current instruction could be at any index in the group.
    // The address needs to be adjusted to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2365 
2366     bool InBounds = false;
2367     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2368       InBounds = gep->isInBounds();
2369     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2370     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2371 
2372     // Cast to the vector pointer type.
2373     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2374     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2375     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2376   }
2377 
2378   setDebugLocFromInst(Builder, Instr);
2379   Value *UndefVec = UndefValue::get(VecTy);
2380 
2381   Value *MaskForGaps = nullptr;
2382   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2383     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2384     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2385     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2386   }
2387 
2388   // Vectorize the interleaved load group.
2389   if (isa<LoadInst>(Instr)) {
2390     // For each unroll part, create a wide load for the group.
2391     SmallVector<Value *, 2> NewLoads;
2392     for (unsigned Part = 0; Part < UF; Part++) {
2393       Instruction *NewLoad;
2394       if (BlockInMask || MaskForGaps) {
2395         assert(useMaskedInterleavedAccesses(*TTI) &&
2396                "masked interleaved groups are not allowed.");
2397         Value *GroupMask = MaskForGaps;
2398         if (BlockInMask) {
2399           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2400           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2401           Value *ShuffledMask = Builder.CreateShuffleVector(
2402               BlockInMaskPart,
2403               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2404               "interleaved.mask");
2405           GroupMask = MaskForGaps
2406                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2407                                                 MaskForGaps)
2408                           : ShuffledMask;
2409         }
2410         NewLoad =
2411             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2412                                      GroupMask, UndefVec, "wide.masked.vec");
2413       }
2414       else
2415         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2416                                             Group->getAlign(), "wide.vec");
2417       Group->addMetadata(NewLoad);
2418       NewLoads.push_back(NewLoad);
2419     }
2420 
2421     // For each member in the group, shuffle out the appropriate data from the
2422     // wide loads.
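    //
    // For example (illustrative): with factor 2 and VF = 4, member 0 is
    // extracted with the stride mask <0,2,4,6> and member 1 with <1,3,5,7>.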
2423     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2424       Instruction *Member = Group->getMember(I);
2425 
2426       // Skip the gaps in the group.
2427       if (!Member)
2428         continue;
2429 
2430       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2431       auto StrideMask =
2432           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2433       for (unsigned Part = 0; Part < UF; Part++) {
2434         Value *StridedVec = Builder.CreateShuffleVector(
2435             NewLoads[Part], StrideMask, "strided.vec");
2436 
        // If this member has a different type, cast the result to that type.
2438         if (Member->getType() != ScalarTy) {
2439           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2440           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2441           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2442         }
2443 
2444         if (Group->isReverse())
2445           StridedVec = reverseVector(StridedVec);
2446 
2447         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2448       }
2449     }
2450     return;
2451   }
2452 
  // The subvector type for the current instruction.
2454   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2455   auto *SubVT = VectorType::get(ScalarTy, VF);
2456 
2457   // Vectorize the interleaved store group.
2458   for (unsigned Part = 0; Part < UF; Part++) {
2459     // Collect the stored vector from each member.
2460     SmallVector<Value *, 4> StoredVecs;
2461     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2463       Instruction *Member = Group->getMember(i);
2464       assert(Member && "Fail to get a member from an interleaved store group");
2465 
2466       Value *StoredVec = getOrCreateVectorValue(
2467           cast<StoreInst>(Member)->getValueOperand(), Part);
2468       if (Group->isReverse())
2469         StoredVec = reverseVector(StoredVec);
2470 
      // If this member has a different type, cast it to the unified type.
2473       if (StoredVec->getType() != SubVT)
2474         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2475 
2476       StoredVecs.push_back(StoredVec);
2477     }
2478 
2479     // Concatenate all vectors into a wide vector.
2480     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2481 
2482     // Interleave the elements in the wide vector.
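    // E.g. (illustrative): with VF = 4 and factor 2, the concatenated vector
    // <a0,a1,a2,a3,b0,b1,b2,b3> is shuffled with mask <0,4,1,5,2,6,3,7> into
    // <a0,b0,a1,b1,a2,b2,a3,b3>.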
2483     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2484     Value *IVec = Builder.CreateShuffleVector(
2485         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2486         "interleaved.vec");
2487 
2488     Instruction *NewStoreInstr;
2489     if (BlockInMask) {
2490       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2491       Value *ShuffledMask = Builder.CreateShuffleVector(
2492           BlockInMaskPart,
2493           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2494           "interleaved.mask");
2495       NewStoreInstr = Builder.CreateMaskedStore(
2496           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2497     }
2498     else
2499       NewStoreInstr =
2500           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2501 
2502     Group->addMetadata(NewStoreInstr);
2503   }
2504 }
2505 
2506 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2507                                                      VPTransformState &State,
2508                                                      VPValue *Addr,
2509                                                      VPValue *StoredValue,
2510                                                      VPValue *BlockInMask) {
2511   // Attempt to issue a wide load.
2512   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2513   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2514 
2515   assert((LI || SI) && "Invalid Load/Store instruction");
2516   assert((!SI || StoredValue) && "No stored value provided for widened store");
2517   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2518 
2519   LoopVectorizationCostModel::InstWidening Decision =
2520       Cost->getWideningDecision(Instr, VF);
2521   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2522           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2523           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2524          "CM decision is not to widen the memory instruction");
2525 
2526   Type *ScalarDataTy = getMemInstValueType(Instr);
2527 
2528   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2529   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2530   const Align Alignment = getLoadStoreAlignment(Instr);
2531 
2532   // Determine if the pointer operand of the access is either consecutive or
2533   // reverse consecutive.
2534   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2535   bool ConsecutiveStride =
2536       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2537   bool CreateGatherScatter =
2538       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2539 
2540   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2541   // gather/scatter. Otherwise Decision should have been to Scalarize.
2542   assert((ConsecutiveStride || CreateGatherScatter) &&
2543          "The instruction should be scalarized");
2544   (void)ConsecutiveStride;
2545 
2546   VectorParts BlockInMaskParts(UF);
2547   bool isMaskRequired = BlockInMask;
2548   if (isMaskRequired)
2549     for (unsigned Part = 0; Part < UF; ++Part)
2550       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2551 
2552   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2553     // Calculate the pointer for the specific unroll-part.
2554     GetElementPtrInst *PartPtr = nullptr;
2555 
2556     bool InBounds = false;
2557     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2558       InBounds = gep->isInBounds();
2559 
2560     if (Reverse) {
2561       // If the address is consecutive but reversed, then the
2562       // wide store needs to start at the last vector element.
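      // Illustrative example: with VF = 4, part 0 starts at element offset
      // 1 - 4 = -3 (covering elements -3..0), part 1 at -4 + (1 - 4) = -7,
      // and so on; each wide access still uses increasing addresses.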
2563       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2564           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2565       PartPtr->setIsInBounds(InBounds);
2566       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2567           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2568       PartPtr->setIsInBounds(InBounds);
2569       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2570         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2571     } else {
2572       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2573           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2574       PartPtr->setIsInBounds(InBounds);
2575     }
2576 
2577     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2578     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2579   };
2580 
2581   // Handle Stores:
2582   if (SI) {
2583     setDebugLocFromInst(Builder, SI);
2584 
2585     for (unsigned Part = 0; Part < UF; ++Part) {
2586       Instruction *NewSI = nullptr;
2587       Value *StoredVal = State.get(StoredValue, Part);
2588       if (CreateGatherScatter) {
2589         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2590         Value *VectorGep = State.get(Addr, Part);
2591         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2592                                             MaskPart);
2593       } else {
2594         if (Reverse) {
2595           // If we store to reverse consecutive memory locations, then we need
2596           // to reverse the order of elements in the stored value.
2597           StoredVal = reverseVector(StoredVal);
2598           // We don't want to update the value in the map as it might be used in
2599           // another expression. So don't call resetVectorValue(StoredVal).
2600         }
2601         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2602         if (isMaskRequired)
2603           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2604                                             BlockInMaskParts[Part]);
2605         else
2606           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2607       }
2608       addMetadata(NewSI, SI);
2609     }
2610     return;
2611   }
2612 
2613   // Handle loads.
2614   assert(LI && "Must have a load instruction");
2615   setDebugLocFromInst(Builder, LI);
2616   for (unsigned Part = 0; Part < UF; ++Part) {
2617     Value *NewLI;
2618     if (CreateGatherScatter) {
2619       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2620       Value *VectorGep = State.get(Addr, Part);
2621       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2622                                          nullptr, "wide.masked.gather");
2623       addMetadata(NewLI, LI);
2624     } else {
2625       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2626       if (isMaskRequired)
2627         NewLI = Builder.CreateMaskedLoad(
2628             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2629             "wide.masked.load");
2630       else
2631         NewLI =
2632             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2633 
      // Add metadata to the load itself, but register the reverse shuffle as
      // the vector value below.
2635       addMetadata(NewLI, LI);
2636       if (Reverse)
2637         NewLI = reverseVector(NewLI);
2638     }
2639     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2640   }
2641 }
2642 
2643 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2644                                                const VPIteration &Instance,
2645                                                bool IfPredicateInstr,
2646                                                VPTransformState &State) {
2647   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2648 
2649   setDebugLocFromInst(Builder, Instr);
2650 
  // Does this instruction return a value?
2652   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2653 
2654   Instruction *Cloned = Instr->clone();
2655   if (!IsVoidRetTy)
2656     Cloned->setName(Instr->getName() + ".cloned");
2657 
2658   // Replace the operands of the cloned instructions with their scalar
2659   // equivalents in the new loop.
2660   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2661     auto *NewOp = State.get(User.getOperand(op), Instance);
2662     Cloned->setOperand(op, NewOp);
2663   }
2664   addNewMetadata(Cloned, Instr);
2665 
2666   // Place the cloned scalar in the new loop.
2667   Builder.Insert(Cloned);
2668 
2669   // Add the cloned scalar to the scalar map entry.
2670   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2671 
  // If we just cloned a new assumption, add it to the assumption cache.
2673   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2674     if (II->getIntrinsicID() == Intrinsic::assume)
2675       AC->registerAssumption(II);
2676 
2677   // End if-block.
2678   if (IfPredicateInstr)
2679     PredicatedInstructions.push_back(Cloned);
2680 }
2681 
2682 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2683                                                       Value *End, Value *Step,
2684                                                       Instruction *DL) {
2685   BasicBlock *Header = L->getHeader();
2686   BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible that no latch exists yet.
  // If so, use the header, as this will be a single-block loop.
2689   if (!Latch)
2690     Latch = Header;
2691 
2692   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2693   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2694   setDebugLocFromInst(Builder, OldInst);
2695   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2696 
2697   Builder.SetInsertPoint(Latch->getTerminator());
2698   setDebugLocFromInst(Builder, OldInst);
2699 
2700   // Create i+1 and fill the PHINode.
2701   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2702   Induction->addIncoming(Start, L->getLoopPreheader());
2703   Induction->addIncoming(Next, Latch);
2704   // Create the compare.
2705   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2706   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2707 
2708   // Now we have two terminators. Remove the old one from the block.
2709   Latch->getTerminator()->eraseFromParent();
2710 
2711   return Induction;
2712 }
2713 
2714 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2715   if (TripCount)
2716     return TripCount;
2717 
2718   assert(L && "Create Trip Count for null loop.");
2719   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2720   // Find the loop boundaries.
2721   ScalarEvolution *SE = PSE.getSE();
2722   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2723   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2724          "Invalid loop count");
2725 
2726   Type *IdxTy = Legal->getWidestInductionType();
2727   assert(IdxTy && "No type for induction");
2728 
  // The exit count might have the type i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable was signed and as such will not overflow, so
  // truncation is legal.
2734   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2735       IdxTy->getPrimitiveSizeInBits())
2736     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2737   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2738 
2739   // Get the total trip count from the count by adding 1.
2740   const SCEV *ExitCount = SE->getAddExpr(
2741       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2742 
2743   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2744 
2745   // Expand the trip count and place the new instructions in the preheader.
2746   // Notice that the pre-header does not change, only the loop body.
2747   SCEVExpander Exp(*SE, DL, "induction");
2748 
2749   // Count holds the overall loop count (N).
2750   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2751                                 L->getLoopPreheader()->getTerminator());
2752 
2753   if (TripCount->getType()->isPointerTy())
2754     TripCount =
2755         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2756                                     L->getLoopPreheader()->getTerminator());
2757 
2758   return TripCount;
2759 }
2760 
2761 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2762   if (VectorTripCount)
2763     return VectorTripCount;
2764 
2765   Value *TC = getOrCreateTripCount(L);
2766   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2767 
2768   Type *Ty = TC->getType();
2769   // This is where we can make the step a runtime constant.
2770   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2771   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2772 
2773   // If the tail is to be folded by masking, round the number of iterations N
2774   // up to a multiple of Step instead of rounding down. This is done by first
2775   // adding Step-1 and then rounding down. Note that it's ok if this addition
2776   // overflows: the vector induction variable will eventually wrap to zero given
2777   // that it starts at zero and its Step is a power of two; the loop will then
2778   // exit, with the last early-exit vector comparison also producing all-true.
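  // Illustrative example: with N = 10 and VF * UF = 4, TC becomes 10 + 3 = 13,
  // the remainder below is 13 % 4 = 1, and the vector trip count is 12, so the
  // masked vector loop alone covers all 10 original iterations.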
2779   if (Cost->foldTailByMasking()) {
2780     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2781            "VF*UF must be a power of 2 when folding tail by masking");
2782     TC = Builder.CreateAdd(
2783         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2784   }
2785 
2786   // Now we need to generate the expression for the part of the loop that the
2787   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2788   // iterations are not required for correctness, or N - Step, otherwise. Step
2789   // is equal to the vectorization factor (number of SIMD elements) times the
2790   // unroll factor (number of SIMD instructions).
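  // E.g. (illustrative): without tail folding, N = 10 and Step = 4 give
  // n.mod.vf = 2 and n.vec = 8, leaving two iterations for the scalar loop.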
2791   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2792 
2793   // If there is a non-reversed interleaved group that may speculatively access
2794   // memory out-of-bounds, we need to ensure that there will be at least one
2795   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2796   // the trip count, we set the remainder to be equal to the step. If the step
2797   // does not evenly divide the trip count, no adjustment is necessary since
2798   // there will already be scalar iterations. Note that the minimum iterations
2799   // check ensures that N >= Step.
2800   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2801     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2802     R = Builder.CreateSelect(IsZero, Step, R);
2803   }
2804 
2805   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2806 
2807   return VectorTripCount;
2808 }
2809 
2810 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2811                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2813   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2814   unsigned VF = DstFVTy->getNumElements();
2815   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2816   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2817   Type *SrcElemTy = SrcVecTy->getElementType();
2818   Type *DstElemTy = DstFVTy->getElementType();
2819   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2820          "Vector elements must have same size");
2821 
2822   // Do a direct cast if element types are castable.
2823   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2824     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2825   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
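  // Illustrative example (assuming 64-bit pointers): casting <4 x double> to
  // <4 x i8*> goes through <4 x i64>:
  //   %tmp = bitcast <4 x double> %v to <4 x i64>
  //   %res = inttoptr <4 x i64> %tmp to <4 x i8*>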
2830   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2831          "Only one type should be a pointer type");
2832   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2833          "Only one type should be a floating point type");
2834   Type *IntTy =
2835       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2836   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2837   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2838   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2839 }
2840 
2841 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2842                                                          BasicBlock *Bypass) {
2843   Value *Count = getOrCreateTripCount(L);
2844   // Reuse existing vector loop preheader for TC checks.
2845   // Note that new preheader block is generated for vector loop.
2846   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2847   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2848 
2849   // Generate code to check if the loop's trip count is less than VF * UF, or
2850   // equal to it in case a scalar epilogue is required; this implies that the
2851   // vector trip count is zero. This check also covers the case where adding one
2852   // to the backedge-taken count overflowed leading to an incorrect trip count
2853   // of zero. In this case we will also jump to the scalar loop.
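  // Illustrative example: with VF * UF = 8 and a required scalar epilogue, a
  // trip count of 8 takes the bypass (ULE), since the vector loop would leave
  // no iterations for the epilogue.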
2854   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2855                                           : ICmpInst::ICMP_ULT;
2856 
2857   // If tail is to be folded, vector loop takes care of all iterations.
2858   Value *CheckMinIters = Builder.getFalse();
2859   if (!Cost->foldTailByMasking()) {
2860     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2861     CheckMinIters = Builder.CreateICmp(
2862         P, Count,
2863         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2864         "min.iters.check");
2865   }
2866   // Create new preheader for vector loop.
2867   LoopVectorPreHeader =
2868       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2869                  "vector.ph");
2870 
2871   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2872                                DT->getNode(Bypass)->getIDom()) &&
2873          "TC check is expected to dominate Bypass");
2874 
2875   // Update dominator for Bypass & LoopExit.
2876   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2877   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2878 
2879   ReplaceInstWithInst(
2880       TCCheckBlock->getTerminator(),
2881       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2882   LoopBypassBlocks.push_back(TCCheckBlock);
2883 }
2884 
2885 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2886   // Reuse existing vector loop preheader for SCEV checks.
2887   // Note that new preheader block is generated for vector loop.
2888   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2889 
  // Generate the code to check the SCEV assumptions that we made.
2891   // We want the new basic block to start at the first instruction in a
2892   // sequence of instructions that form a check.
2893   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2894                    "scev.check");
2895   Value *SCEVCheck = Exp.expandCodeForPredicate(
2896       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2897 
2898   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2899     if (C->isZero())
2900       return;
2901 
2902   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2903            (OptForSizeBasedOnProfile &&
2904             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2905          "Cannot SCEV check stride or overflow when optimizing for size");
2906 
2907   SCEVCheckBlock->setName("vector.scevcheck");
2908   // Create new preheader for vector loop.
2909   LoopVectorPreHeader =
2910       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2911                  nullptr, "vector.ph");
2912 
2913   // Update dominator only if this is first RT check.
2914   if (LoopBypassBlocks.empty()) {
2915     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2916     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2917   }
2918 
2919   ReplaceInstWithInst(
2920       SCEVCheckBlock->getTerminator(),
2921       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2922   LoopBypassBlocks.push_back(SCEVCheckBlock);
2923   AddedSafetyChecks = true;
2924 }
2925 
2926 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2927   // VPlan-native path does not do any analysis for runtime checks currently.
2928   if (EnableVPlanNativePath)
2929     return;
2930 
2931   // Reuse existing vector loop preheader for runtime memory checks.
2932   // Note that new preheader block is generated for vector loop.
2933   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2934 
2935   // Generate the code that checks in runtime if arrays overlap. We put the
2936   // checks into a separate block to make the more common case of few elements
2937   // faster.
2938   auto *LAI = Legal->getLAI();
2939   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2940   if (!RtPtrChecking.Need)
2941     return;
2942 
2943   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2944     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2945            "Cannot emit memory checks when optimizing for size, unless forced "
2946            "to vectorize.");
2947     ORE->emit([&]() {
2948       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2949                                         L->getStartLoc(), L->getHeader())
2950              << "Code-size may be reduced by not forcing "
2951                 "vectorization, or by source-code modifications "
2952                 "eliminating the need for runtime checks "
2953                 "(e.g., adding 'restrict').";
2954     });
2955   }
2956 
2957   MemCheckBlock->setName("vector.memcheck");
2958   // Create new preheader for vector loop.
2959   LoopVectorPreHeader =
2960       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2961                  "vector.ph");
2962 
2963   auto *CondBranch = cast<BranchInst>(
2964       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
2965   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
2966   LoopBypassBlocks.push_back(MemCheckBlock);
2967   AddedSafetyChecks = true;
2968 
2969   // Update dominator only if this is first RT check.
2970   if (LoopBypassBlocks.empty()) {
2971     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2972     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2973   }
2974 
2975   Instruction *FirstCheckInst;
2976   Instruction *MemRuntimeCheck;
2977   std::tie(FirstCheckInst, MemRuntimeCheck) =
2978       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2979                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2980   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2981                             "claimed checks are required");
2982   CondBranch->setCondition(MemRuntimeCheck);
2983 
2984   // We currently don't use LoopVersioning for the actual loop cloning but we
2985   // still use it to add the noalias metadata.
2986   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2987                                           PSE.getSE());
2988   LVer->prepareNoAliasMetadata();
2989 }
2990 
2991 Value *InnerLoopVectorizer::emitTransformedIndex(
2992     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2993     const InductionDescriptor &ID) const {
2994 
2995   SCEVExpander Exp(*SE, DL, "induction");
2996   auto Step = ID.getStep();
2997   auto StartValue = ID.getStartValue();
2998   assert(Index->getType() == Step->getType() &&
2999          "Index type does not match StepValue type");
3000 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3007   auto CreateAdd = [&B](Value *X, Value *Y) {
3008     assert(X->getType() == Y->getType() && "Types don't match!");
3009     if (auto *CX = dyn_cast<ConstantInt>(X))
3010       if (CX->isZero())
3011         return Y;
3012     if (auto *CY = dyn_cast<ConstantInt>(Y))
3013       if (CY->isZero())
3014         return X;
3015     return B.CreateAdd(X, Y);
3016   };
3017 
3018   auto CreateMul = [&B](Value *X, Value *Y) {
3019     assert(X->getType() == Y->getType() && "Types don't match!");
3020     if (auto *CX = dyn_cast<ConstantInt>(X))
3021       if (CX->isOne())
3022         return Y;
3023     if (auto *CY = dyn_cast<ConstantInt>(Y))
3024       if (CY->isOne())
3025         return X;
3026     return B.CreateMul(X, Y);
3027   };
3028 
3029   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3030   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3031   // the DomTree is not kept up-to-date for additional blocks generated in the
3032   // vector loop. By using the header as insertion point, we guarantee that the
3033   // expanded instructions dominate all their uses.
3034   auto GetInsertPoint = [this, &B]() {
3035     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3036     if (InsertBB != LoopVectorBody &&
3037         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3038       return LoopVectorBody->getTerminator();
3039     return &*B.GetInsertPoint();
3040   };
3041   switch (ID.getKind()) {
3042   case InductionDescriptor::IK_IntInduction: {
3043     assert(Index->getType() == StartValue->getType() &&
3044            "Index type does not match StartValue type");
3045     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3046       return B.CreateSub(StartValue, Index);
3047     auto *Offset = CreateMul(
3048         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3049     return CreateAdd(StartValue, Offset);
3050   }
3051   case InductionDescriptor::IK_PtrInduction: {
3052     assert(isa<SCEVConstant>(Step) &&
3053            "Expected constant step for pointer induction");
3054     return B.CreateGEP(
3055         StartValue->getType()->getPointerElementType(), StartValue,
3056         CreateMul(Index,
3057                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3058   }
3059   case InductionDescriptor::IK_FpInduction: {
3060     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3061     auto InductionBinOp = ID.getInductionBinOp();
3062     assert(InductionBinOp &&
3063            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3064             InductionBinOp->getOpcode() == Instruction::FSub) &&
3065            "Original bin op should be defined for FP induction");
3066 
3067     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3068 
3069     // Floating point operations had to be 'fast' to enable the induction.
3070     FastMathFlags Flags;
3071     Flags.setFast();
3072 
3073     Value *MulExp = B.CreateFMul(StepValue, Index);
3074     if (isa<Instruction>(MulExp))
3075       // We have to check, the MulExp may be a constant.
3076       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3077 
3078     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3079                                "induction");
3080     if (isa<Instruction>(BOp))
3081       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3082 
3083     return BOp;
3084   }
3085   case InductionDescriptor::IK_NoInduction:
3086     return nullptr;
3087   }
3088   llvm_unreachable("invalid enum");
3089 }
3090 
3091 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3092   LoopScalarBody = OrigLoop->getHeader();
3093   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3094   LoopExitBlock = OrigLoop->getExitBlock();
3095   assert(LoopExitBlock && "Must have an exit block");
3096   assert(LoopVectorPreHeader && "Invalid loop structure");
3097 
3098   LoopMiddleBlock =
3099       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3100                  LI, nullptr, Twine(Prefix) + "middle.block");
3101   LoopScalarPreHeader =
3102       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3103                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3107   LoopVectorBody =
3108       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3109                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3110 
3111   // Update dominator for loop exit.
3112   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3113 
3114   // Create and register the new vector loop.
3115   Loop *Lp = LI->AllocateLoop();
3116   Loop *ParentLoop = OrigLoop->getParentLoop();
3117 
3118   // Insert the new loop into the loop nest and register the new basic blocks
3119   // before calling any utilities such as SCEV that require valid LoopInfo.
3120   if (ParentLoop) {
3121     ParentLoop->addChildLoop(Lp);
3122   } else {
3123     LI->addTopLevelLoop(Lp);
3124   }
3125   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3126   return Lp;
3127 }
3128 
3129 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3130                                                       Value *VectorTripCount) {
3131   assert(VectorTripCount && L && "Expected valid arguments");
3132   // We are going to resume the execution of the scalar loop.
3133   // Go over all of the induction variables that we found and fix the
3134   // PHIs that are left in the scalar version of the loop.
3135   // The starting values of PHI nodes depend on the counter of the last
3136   // iteration in the vectorized loop.
3137   // If we come from a bypass edge then we need to start from the original
3138   // start value.
3139   for (auto &InductionEntry : Legal->getInductionVars()) {
3140     PHINode *OrigPhi = InductionEntry.first;
3141     InductionDescriptor II = InductionEntry.second;
3142 
    // Create phi nodes to merge from the backedge-taken check block.
3144     PHINode *BCResumeVal =
3145         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3146                         LoopScalarPreHeader->getTerminator());
3147     // Copy original phi DL over to the new one.
3148     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3149     Value *&EndValue = IVEndValues[OrigPhi];
3150     if (OrigPhi == OldInduction) {
3151       // We know what the end value is.
3152       EndValue = VectorTripCount;
3153     } else {
3154       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3155       Type *StepType = II.getStep()->getType();
3156       Instruction::CastOps CastOp =
3157           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3158       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3159       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3160       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3161       EndValue->setName("ind.end");
3162     }
3163 
3164     // The new PHI merges the original incoming value, in case of a bypass,
3165     // or the value at the end of the vectorized loop.
3166     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3167 
3168     // Fix the scalar body counter (PHI node).
3169     // The old induction's phi node in the scalar body needs the truncated
3170     // value.
3171     for (BasicBlock *BB : LoopBypassBlocks)
3172       BCResumeVal->addIncoming(II.getStartValue(), BB);
3173     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3174   }
3175 }
3176 
3177 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3178                                                       MDNode *OrigLoopID) {
3179   assert(L && "Expected valid loop.");
3180 
3181   // The trip counts should be cached by now.
3182   Value *Count = getOrCreateTripCount(L);
3183   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3184 
3185   // We need the OrigLoop (scalar loop part) latch terminator to help
3186   // produce correct debug info for the middle block BB instructions.
3187   // The legality check stage guarantees that the loop will have a single
3188   // latch.
3189   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3190          "Scalar loop latch terminator isn't a branch");
3191   BranchInst *ScalarLatchBr =
3192       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3193 
3194   // Add a check in the middle block to see if we have completed
3195   // all of the iterations in the first vector loop.
3196   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3197   // If tail is to be folded, we know we don't need to run the remainder.
3198   Value *CmpN = Builder.getTrue();
3199   if (!Cost->foldTailByMasking()) {
3200     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3201                            VectorTripCount, "cmp.n",
3202                            LoopMiddleBlock->getTerminator());
3203 
3204     // Here we use the same DebugLoc as the scalar loop latch branch instead
3205     // of the corresponding compare because they may have ended up with
3206     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3208     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3209   }
3210 
3211   BranchInst *BrInst =
3212       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3213   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3214   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3215 
3216   // Get ready to start creating new instructions into the vectorized body.
3217   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3218          "Inconsistent vector loop preheader");
3219   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3220 
3221   Optional<MDNode *> VectorizedLoopID =
3222       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3223                                       LLVMLoopVectorizeFollowupVectorized});
3224   if (VectorizedLoopID.hasValue()) {
3225     L->setLoopID(VectorizedLoopID.getValue());
3226 
3227     // Do not setAlreadyVectorized if loop attributes have been defined
3228     // explicitly.
3229     return LoopVectorPreHeader;
3230   }
3231 
3232   // Keep all loop hints from the original loop on the vector loop (we'll
3233   // replace the vectorizer-specific hints below).
3234   if (MDNode *LID = OrigLoop->getLoopID())
3235     L->setLoopID(LID);
3236 
3237   LoopVectorizeHints Hints(L, true, *ORE);
3238   Hints.setAlreadyVectorized();
3239 
3240 #ifdef EXPENSIVE_CHECKS
3241   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3242   LI->verify(*DT);
3243 #endif
3244 
3245   return LoopVectorPreHeader;
3246 }
3247 
3248 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3249   /*
3250    In this function we generate a new loop. The new loop will contain
3251    the vectorized instructions while the old loop will continue to run the
3252    scalar remainder.
3253 
3254        [ ] <-- loop iteration number check.
3255     /   |
3256    /    v
3257   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3258   |  /  |
3259   | /   v
3260   ||   [ ]     <-- vector pre header.
3261   |/    |
3262   |     v
3263   |    [  ] \
3264   |    [  ]_|   <-- vector loop.
3265   |     |
3266   |     v
3267   |   -[ ]   <--- middle-block.
3268   |  /  |
3269   | /   v
3270   -|- >[ ]     <--- new preheader.
3271    |    |
3272    |    v
3273    |   [ ] \
3274    |   [ ]_|   <-- old scalar loop to handle remainder.
3275     \   |
3276      \  v
3277       >[ ]     <-- exit block.
3278    ...
3279    */
3280 
3281   // Get the metadata of the original loop before it gets modified.
3282   MDNode *OrigLoopID = OrigLoop->getLoopID();
3283 
3284   // Create an empty vector loop, and prepare basic blocks for the runtime
3285   // checks.
3286   Loop *Lp = createVectorLoopSkeleton("");
3287 
3288   // Now, compare the new count to zero. If it is zero skip the vector loop and
3289   // jump to the scalar loop. This check also covers the case where the
3290   // backedge-taken count is uint##_max: adding one to it will overflow leading
3291   // to an incorrect trip count of zero. In this (rare) case we will also jump
3292   // to the scalar loop.
3293   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3294 
3295   // Generate the code to check any assumptions that we've made for SCEV
3296   // expressions.
3297   emitSCEVChecks(Lp, LoopScalarPreHeader);
3298 
3299   // Generate the code that checks in runtime if arrays overlap. We put the
3300   // checks into a separate block to make the more common case of few elements
3301   // faster.
3302   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3303 
3304   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. The code below also supports the case where there is
  // no single induction variable.
3308   //
3309   // We try to obtain an induction variable from the original loop as hard
3310   // as possible. However if we don't find one that:
3311   //   - is an integer
3312   //   - counts from zero, stepping by one
3313   //   - is the size of the widest induction variable type
3314   // then we create a new one.
3315   OldInduction = Legal->getPrimaryInduction();
3316   Type *IdxTy = Legal->getWidestInductionType();
3317   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3318   // The loop step is equal to the vectorization factor (num of SIMD elements)
3319   // times the unroll factor (num of SIMD instructions).
3320   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3321   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3322   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3323   Induction =
3324       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3325                               getDebugLocFromInstOrOperands(OldInduction));
3326 
3327   // Emit phis for the new starting index of the scalar loop.
3328   createInductionResumeValues(Lp, CountRoundDown);
3329 
3330   return completeLoopSkeleton(Lp, OrigLoopID);
3331 }
3332 
3333 // Fix up external users of the induction variable. At this point, we are
3334 // in LCSSA form, with all external PHIs that use the IV having one input value,
3335 // coming from the remainder loop. We need those PHIs to also have a correct
3336 // value for the IV when arriving directly from the middle block.
3337 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3338                                        const InductionDescriptor &II,
3339                                        Value *CountRoundDown, Value *EndValue,
3340                                        BasicBlock *MiddleBlock) {
3341   // There are two kinds of external IV usages - those that use the value
3342   // computed in the last iteration (the PHI) and those that use the penultimate
3343   // value (the value that feeds into the phi from the loop latch).
3344   // We allow both, but they, obviously, have different values.
3345 
3346   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3347 
3348   DenseMap<Value *, Value *> MissingVals;
3349 
3350   // An external user of the last iteration's value should see the value that
3351   // the remainder loop uses to initialize its own IV.
3352   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3353   for (User *U : PostInc->users()) {
3354     Instruction *UI = cast<Instruction>(U);
3355     if (!OrigLoop->contains(UI)) {
3356       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3357       MissingVals[UI] = EndValue;
3358     }
3359   }
3360 
  // An external user of the penultimate value needs to see EndValue - Step.
3362   // The simplest way to get this is to recompute it from the constituent SCEVs,
3363   // that is Start + (Step * (CRD - 1)).
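  // Illustrative example: for an IV starting at 5 with step 3 and
  // CountRoundDown = 8, the escaping value is 5 + 3 * (8 - 1) = 26.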
3364   for (User *U : OrigPhi->users()) {
3365     auto *UI = cast<Instruction>(U);
3366     if (!OrigLoop->contains(UI)) {
3367       const DataLayout &DL =
3368           OrigLoop->getHeader()->getModule()->getDataLayout();
3369       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3370 
3371       IRBuilder<> B(MiddleBlock->getTerminator());
3372       Value *CountMinusOne = B.CreateSub(
3373           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3374       Value *CMO =
3375           !II.getStep()->getType()->isIntegerTy()
3376               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3377                              II.getStep()->getType())
3378               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3379       CMO->setName("cast.cmo");
3380       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3381       Escape->setName("ind.escape");
3382       MissingVals[UI] = Escape;
3383     }
3384   }
3385 
3386   for (auto &I : MissingVals) {
3387     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3389     // that is %IV2 = phi [...], [ %IV1, %latch ]
3390     // In this case, if IV1 has an external use, we need to avoid adding both
3391     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3392     // don't already have an incoming value for the middle block.
3393     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3394       PHI->addIncoming(I.second, MiddleBlock);
3395   }
3396 }
3397 
3398 namespace {
3399 
3400 struct CSEDenseMapInfo {
3401   static bool canHandle(const Instruction *I) {
3402     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3403            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3404   }
3405 
3406   static inline Instruction *getEmptyKey() {
3407     return DenseMapInfo<Instruction *>::getEmptyKey();
3408   }
3409 
3410   static inline Instruction *getTombstoneKey() {
3411     return DenseMapInfo<Instruction *>::getTombstoneKey();
3412   }
3413 
3414   static unsigned getHashValue(const Instruction *I) {
3415     assert(canHandle(I) && "Unknown instruction!");
3416     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3417                                                            I->value_op_end()));
3418   }
3419 
3420   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3421     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3422         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3423       return LHS == RHS;
3424     return LHS->isIdenticalTo(RHS);
3425   }
3426 };
3427 
3428 } // end anonymous namespace
3429 
/// Perform CSE of induction variable instructions.
3431 static void cse(BasicBlock *BB) {
3432   // Perform simple cse.
3433   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3434   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3435     Instruction *In = &*I++;
3436 
3437     if (!CSEDenseMapInfo::canHandle(In))
3438       continue;
3439 
3440     // Check if we can replace this instruction with any of the
3441     // visited instructions.
3442     if (Instruction *V = CSEMap.lookup(In)) {
3443       In->replaceAllUsesWith(V);
3444       In->eraseFromParent();
3445       continue;
3446     }
3447 
3448     CSEMap[In] = In;
3449   }
3450 }
3451 
3452 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3453                                                        ElementCount VF,
3454                                                        bool &NeedToScalarize) {
3455   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3456   Function *F = CI->getCalledFunction();
3457   Type *ScalarRetTy = CI->getType();
3458   SmallVector<Type *, 4> Tys, ScalarTys;
3459   for (auto &ArgOp : CI->arg_operands())
3460     ScalarTys.push_back(ArgOp->getType());
3461 
3462   // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from them,
3464   // execute VF scalar calls, and then gather the result into the vector return
3465   // value.
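  // E.g. (illustrative): with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 12, the scalarized cost computed below is
  // 4 * 10 + 12 = 52.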
3466   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3467                                                  TTI::TCK_RecipThroughput);
3468   if (VF.isScalar())
3469     return ScalarCallCost;
3470 
3471   // Compute corresponding vector type for return value and arguments.
3472   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3473   for (Type *ScalarTy : ScalarTys)
3474     Tys.push_back(ToVectorTy(ScalarTy, VF));
3475 
3476   // Compute costs of unpacking argument values for the scalar calls and
3477   // packing the return values to a vector.
3478   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3479 
3480   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3481 
3482   // If we can't emit a vector call for this function, then the currently found
3483   // cost is the cost we need to return.
3484   NeedToScalarize = true;
3485   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3486   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3487 
3488   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3489     return Cost;
3490 
3491   // If the corresponding vector cost is cheaper, return its cost.
3492   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3493                                                  TTI::TCK_RecipThroughput);
3494   if (VectorCallCost < Cost) {
3495     NeedToScalarize = false;
3496     return VectorCallCost;
3497   }
3498   return Cost;
3499 }
3500 
3501 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3502                                                             ElementCount VF) {
3503   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3504   assert(ID && "Expected intrinsic call!");
3505 
3506   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3507   return TTI.getIntrinsicInstrCost(CostAttrs,
3508                                    TargetTransformInfo::TCK_RecipThroughput);
3509 }
3510 
3511 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3512   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3513   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3514   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3515 }
3516 
3517 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3518   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3519   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3520   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3521 }
3522 
3523 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3524   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I`, and re-extend its result. InstCombine runs
3526   // later and will remove any ext/trunc pairs.
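  // Illustrative example: if MinBWs says an i32 add needs only 8 bits, a
  // vectorized <4 x i32> add is rewritten roughly as:
  //   %a.tr = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr = trunc <4 x i32> %b to <4 x i8>
  //   %add  = add <4 x i8> %a.tr, %b.tr
  //   %res  = zext <4 x i8> %add to <4 x i32>
  // with InstCombine later removing redundant trunc/zext pairs.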
3527   SmallPtrSet<Value *, 4> Erased;
3528   for (const auto &KV : Cost->getMinimalBitwidths()) {
3529     // If the value wasn't vectorized, we must maintain the original scalar
3530     // type. The absence of the value from VectorLoopValueMap indicates that it
3531     // wasn't vectorized.
3532     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3533       continue;
3534     for (unsigned Part = 0; Part < UF; ++Part) {
3535       Value *I = getOrCreateVectorValue(KV.first, Part);
3536       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3537         continue;
3538       Type *OriginalTy = I->getType();
3539       Type *ScalarTruncatedTy =
3540           IntegerType::get(OriginalTy->getContext(), KV.second);
3541       auto *TruncatedTy = FixedVectorType::get(
3542           ScalarTruncatedTy,
3543           cast<FixedVectorType>(OriginalTy)->getNumElements());
3544       if (TruncatedTy == OriginalTy)
3545         continue;
3546 
3547       IRBuilder<> B(cast<Instruction>(I));
3548       auto ShrinkOperand = [&](Value *V) -> Value * {
3549         if (auto *ZI = dyn_cast<ZExtInst>(V))
3550           if (ZI->getSrcTy() == TruncatedTy)
3551             return ZI->getOperand(0);
3552         return B.CreateZExtOrTrunc(V, TruncatedTy);
3553       };
3554 
3555       // The actual instruction modification depends on the instruction type,
3556       // unfortunately.
3557       Value *NewI = nullptr;
3558       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3559         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3560                              ShrinkOperand(BO->getOperand(1)));
3561 
3562         // Any wrapping introduced by shrinking this operation shouldn't be
3563         // considered undefined behavior. So, we can't unconditionally copy
3564         // arithmetic wrapping flags to NewI.
3565         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3566       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3567         NewI =
3568             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3569                          ShrinkOperand(CI->getOperand(1)));
3570       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3571         NewI = B.CreateSelect(SI->getCondition(),
3572                               ShrinkOperand(SI->getTrueValue()),
3573                               ShrinkOperand(SI->getFalseValue()));
3574       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3575         switch (CI->getOpcode()) {
3576         default:
3577           llvm_unreachable("Unhandled cast!");
3578         case Instruction::Trunc:
3579           NewI = ShrinkOperand(CI->getOperand(0));
3580           break;
3581         case Instruction::SExt:
3582           NewI = B.CreateSExtOrTrunc(
3583               CI->getOperand(0),
3584               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3585           break;
3586         case Instruction::ZExt:
3587           NewI = B.CreateZExtOrTrunc(
3588               CI->getOperand(0),
3589               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3590           break;
3591         }
3592       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3593         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3594                              ->getNumElements();
3595         auto *O0 = B.CreateZExtOrTrunc(
3596             SI->getOperand(0),
3597             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3598         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3599                              ->getNumElements();
3600         auto *O1 = B.CreateZExtOrTrunc(
3601             SI->getOperand(1),
3602             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3603 
3604         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3605       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3606         // Don't do anything with the operands, just extend the result.
3607         continue;
3608       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3609         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3610                             ->getNumElements();
3611         auto *O0 = B.CreateZExtOrTrunc(
3612             IE->getOperand(0),
3613             FixedVectorType::get(ScalarTruncatedTy, Elements));
3614         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3615         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3616       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3617         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3618                             ->getNumElements();
3619         auto *O0 = B.CreateZExtOrTrunc(
3620             EE->getOperand(0),
3621             FixedVectorType::get(ScalarTruncatedTy, Elements));
3622         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3623       } else {
3624         // If we don't know what to do, be conservative and don't do anything.
3625         continue;
3626       }
3627 
3628       // Lastly, extend the result.
3629       NewI->takeName(cast<Instruction>(I));
3630       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3631       I->replaceAllUsesWith(Res);
3632       cast<Instruction>(I)->eraseFromParent();
3633       Erased.insert(I);
3634       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3635     }
3636   }
3637 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3639   for (const auto &KV : Cost->getMinimalBitwidths()) {
3640     // If the value wasn't vectorized, we must maintain the original scalar
3641     // type. The absence of the value from VectorLoopValueMap indicates that it
3642     // wasn't vectorized.
3643     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3644       continue;
3645     for (unsigned Part = 0; Part < UF; ++Part) {
3646       Value *I = getOrCreateVectorValue(KV.first, Part);
3647       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3648       if (Inst && Inst->use_empty()) {
3649         Value *NewI = Inst->getOperand(0);
3650         Inst->eraseFromParent();
3651         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3652       }
3653     }
3654   }
3655 }
3656 
3657 void InnerLoopVectorizer::fixVectorizedLoop() {
3658   // Insert truncates and extends for any truncated instructions as hints to
3659   // InstCombine.
3660   if (VF.isVector())
3661     truncateToMinimalBitwidths();
3662 
3663   // Fix widened non-induction PHIs by setting up the PHI operands.
3664   if (OrigPHIsToFix.size()) {
3665     assert(EnableVPlanNativePath &&
3666            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3667     fixNonInductionPHIs();
3668   }
3669 
3670   // At this point every instruction in the original loop is widened to a
3671   // vector form. Now we need to fix the recurrences in the loop. These PHI
3672   // nodes are currently empty because we did not want to introduce cycles.
3673   // This is the second stage of vectorizing recurrences.
3674   fixCrossIterationPHIs();
3675 
3676   // Forget the original basic block.
3677   PSE.getSE()->forgetLoop(OrigLoop);
3678 
3679   // Fix-up external users of the induction variables.
3680   for (auto &Entry : Legal->getInductionVars())
3681     fixupIVUsers(Entry.first, Entry.second,
3682                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3683                  IVEndValues[Entry.first], LoopMiddleBlock);
3684 
3685   fixLCSSAPHIs();
3686   for (Instruction *PI : PredicatedInstructions)
3687     sinkScalarOperands(&*PI);
3688 
3689   // Remove redundant induction instructions.
3690   cse(LoopVectorBody);
3691 
3692   // Set/update profile weights for the vector and remainder loops as original
3693   // loop iterations are now distributed among them. Note that original loop
3694   // represented by LoopScalarBody becomes remainder loop after vectorization.
3695   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3701   assert(!VF.isScalable() &&
3702          "cannot use scalable ElementCount to determine unroll factor");
3703   setProfileInfoAfterUnrolling(
3704       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3705       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3706 }
3707 
3708 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3709   // In order to support recurrences we need to be able to vectorize Phi nodes.
3710   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3711   // stage #2: We now need to fix the recurrences by adding incoming edges to
3712   // the currently empty PHI nodes. At this point every instruction in the
3713   // original loop is widened to a vector form so we can use them to construct
3714   // the incoming edges.
3715   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3716     // Handle first-order recurrences and reductions that need to be fixed.
3717     if (Legal->isFirstOrderRecurrence(&Phi))
3718       fixFirstOrderRecurrence(&Phi);
3719     else if (Legal->isReductionVariable(&Phi))
3720       fixReduction(&Phi);
3721   }
3722 }
3723 
3724 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3725   // This is the second phase of vectorizing first-order recurrences. An
3726   // overview of the transformation is described below. Suppose we have the
3727   // following loop.
3728   //
3729   //   for (int i = 0; i < n; ++i)
3730   //     b[i] = a[i] - a[i - 1];
3731   //
3732   // There is a first-order recurrence on "a". For this loop, the shorthand
3733   // scalar IR looks like:
3734   //
3735   //   scalar.ph:
3736   //     s_init = a[-1]
3737   //     br scalar.body
3738   //
3739   //   scalar.body:
3740   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3741   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3742   //     s2 = a[i]
3743   //     b[i] = s2 - s1
3744   //     br cond, scalar.body, ...
3745   //
  // In this example, s1 is a recurrence because its value depends on the
3747   // previous iteration. In the first phase of vectorization, we created a
3748   // temporary value for s1. We now complete the vectorization and produce the
3749   // shorthand vector IR shown below (for VF = 4, UF = 1).
3750   //
3751   //   vector.ph:
3752   //     v_init = vector(..., ..., ..., a[-1])
3753   //     br vector.body
3754   //
3755   //   vector.body
3756   //     i = phi [0, vector.ph], [i+4, vector.body]
3757   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3758   //     v2 = a[i, i+1, i+2, i+3];
3759   //     v3 = vector(v1(3), v2(0, 1, 2))
3760   //     b[i, i+1, i+2, i+3] = v2 - v3
3761   //     br cond, vector.body, middle.block
3762   //
3763   //   middle.block:
3764   //     x = v2(3)
3765   //     br scalar.ph
3766   //
3767   //   scalar.ph:
3768   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3769   //     br scalar.body
3770   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3773 
3774   // Get the original loop preheader and single loop latch.
3775   auto *Preheader = OrigLoop->getLoopPreheader();
3776   auto *Latch = OrigLoop->getLoopLatch();
3777 
3778   // Get the initial and previous values of the scalar recurrence.
3779   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3780   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3781 
3782   // Create a vector from the initial value.
3783   auto *VectorInit = ScalarInit;
3784   if (VF.isVector()) {
3785     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3786     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3787     VectorInit = Builder.CreateInsertElement(
3788         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3789         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3790   }
3791 
3792   // We constructed a temporary phi node in the first phase of vectorization.
3793   // This phi node will eventually be deleted.
3794   Builder.SetInsertPoint(
3795       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3796 
3797   // Create a phi node for the new recurrence. The current value will either be
3798   // the initial value inserted into a vector or loop-varying vector value.
3799   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3800   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3801 
3802   // Get the vectorized previous value of the last part UF - 1. It appears last
3803   // among all unrolled iterations, due to the order of their construction.
3804   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3805 
3806   // Find and set the insertion point after the previous value if it is an
3807   // instruction.
3808   BasicBlock::iterator InsertPt;
3809   // Note that the previous value may have been constant-folded so it is not
3810   // guaranteed to be an instruction in the vector loop.
3811   // FIXME: Loop invariant values do not form recurrences. We should deal with
3812   //        them earlier.
3813   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3814     InsertPt = LoopVectorBody->getFirstInsertionPt();
3815   else {
3816     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3817     if (isa<PHINode>(PreviousLastPart))
3818       // If the previous value is a phi node, we should insert after all the phi
3819       // nodes in the block containing the PHI to avoid breaking basic block
3820       // verification. Note that the basic block may be different to
3821       // LoopVectorBody, in case we predicate the loop.
3822       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3823     else
3824       InsertPt = ++PreviousInst->getIterator();
3825   }
3826   Builder.SetInsertPoint(&*InsertPt);
3827 
3828   // We will construct a vector for the recurrence by combining the values for
3829   // the current and previous iterations. This is the required shuffle mask.
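  // For example, with VF = 4 the mask is <3, 4, 5, 6>: the last element of
  // the first (previous iteration) vector followed by the first three
  // elements of the second (current iteration) vector, i.e. v3 in the sketch
  // above.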
3830   assert(!VF.isScalable());
3831   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3832   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3833   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3834     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
3835 
3836   // The vector from which to take the initial value for the current iteration
3837   // (actual or unrolled). Initially, this is the vector phi node.
3838   Value *Incoming = VecPhi;
3839 
3840   // Shuffle the current and previous vector and update the vector parts.
3841   for (unsigned Part = 0; Part < UF; ++Part) {
3842     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3843     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3844     auto *Shuffle =
3845         VF.isVector()
3846             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3847             : Incoming;
3848     PhiPart->replaceAllUsesWith(Shuffle);
3849     cast<Instruction>(PhiPart)->eraseFromParent();
3850     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3851     Incoming = PreviousPart;
3852   }
3853 
3854   // Fix the latch value of the new recurrence in the vector loop.
3855   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3856 
3857   // Extract the last vector element in the middle block. This will be the
3858   // initial value for the recurrence when jumping to the scalar loop.
3859   auto *ExtractForScalar = Incoming;
3860   if (VF.isVector()) {
3861     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3862     ExtractForScalar = Builder.CreateExtractElement(
3863         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3864         "vector.recur.extract");
3865   }
  // Extract the second-to-last element in the middle block if the
3867   // Phi is used outside the loop. We need to extract the phi itself
3868   // and not the last element (the phi update in the current iteration). This
3869   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3870   // when the scalar loop is not run at all.
3871   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3872   if (VF.isVector())
3873     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3874         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3875         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
3880   else if (UF > 1)
3881     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3882 
3883   // Fix the initial value of the original recurrence in the scalar loop.
3884   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3885   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3886   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3887     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3888     Start->addIncoming(Incoming, BB);
3889   }
3890 
3891   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3892   Phi->setName("scalar.recur");
3893 
3894   // Finally, fix users of the recurrence outside the loop. The users will need
3895   // either the last value of the scalar recurrence or the last value of the
3896   // vector recurrence we extracted in the middle block. Since the loop is in
3897   // LCSSA form, we just need to find all the phi nodes for the original scalar
3898   // recurrence in the exit block, and then add an edge for the middle block.
3899   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3900     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3901       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3902     }
3903   }
3904 }
3905 
3906 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3907   Constant *Zero = Builder.getInt32(0);
3908 
  // Get its reduction variable descriptor.
3910   assert(Legal->isReductionVariable(Phi) &&
3911          "Unable to find the reduction variable");
3912   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3913 
3914   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3915   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3916   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3917   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3918     RdxDesc.getMinMaxRecurrenceKind();
3919   setDebugLocFromInst(Builder, ReductionStartValue);
3920   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3921 
3922   // We need to generate a reduction vector from the incoming scalar.
3923   // To do so, we need to generate the 'identity' vector and override
3924   // one of the elements with the incoming scalar reduction. We need
3925   // to do it in the vector-loop preheader.
3926   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3927 
3928   // This is the vector-clone of the value that leaves the loop.
3929   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3930 
  // Find the reduction identity value: zero for addition, or, and xor; one
  // for multiplication; -1 for and.
3933   Value *Identity;
3934   Value *VectorStart;
3935   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3936       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3938     if (VF.isScalar() || IsInLoopReductionPhi) {
3939       VectorStart = Identity = ReductionStartValue;
3940     } else {
3941       VectorStart = Identity =
3942         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3943     }
3944   } else {
3945     // Handle other reduction kinds:
3946     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3947         RK, MinMaxKind, VecTy->getScalarType());
3948     if (VF.isScalar() || IsInLoopReductionPhi) {
3949       Identity = Iden;
3950       // This vector is the Identity vector where the first element is the
3951       // incoming scalar reduction.
3952       VectorStart = ReductionStartValue;
3953     } else {
3954       Identity = ConstantVector::getSplat(VF, Iden);
3955 
3956       // This vector is the Identity vector where the first element is the
3957       // incoming scalar reduction.
3958       VectorStart =
3959         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3960     }
3961   }
3962 
3963   // Wrap flags are in general invalid after vectorization, clear them.
3964   clearReductionWrapFlags(RdxDesc);
3965 
3966   // Fix the vector-loop phi.
3967 
3968   // Reductions do not have to start at zero. They can start with
3969   // any loop invariant values.
3970   BasicBlock *Latch = OrigLoop->getLoopLatch();
3971   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3972 
3973   for (unsigned Part = 0; Part < UF; ++Part) {
3974     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3975     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3976     // Make sure to add the reduction start value only to the
3977     // first unroll part.
3978     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3979     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3980     cast<PHINode>(VecRdxPhi)
3981       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3982   }
3983 
3984   // Before each round, move the insertion point right between
3985   // the PHIs and the values we are going to write.
3986   // This allows us to write both PHINodes and the extractelement
3987   // instructions.
3988   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3989 
3990   setDebugLocFromInst(Builder, LoopExitInst);
3991 
3992   // If tail is folded by masking, the vector value to leave the loop should be
3993   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3994   // instead of the former. For an inloop reduction the reduction will already
3995   // be predicated, and does not need to be handled here.
3996   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
3997     for (unsigned Part = 0; Part < UF; ++Part) {
3998       Value *VecLoopExitInst =
3999           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4000       Value *Sel = nullptr;
4001       for (User *U : VecLoopExitInst->users()) {
4002         if (isa<SelectInst>(U)) {
4003           assert(!Sel && "Reduction exit feeding two selects");
4004           Sel = U;
4005         } else
4006           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4007       }
4008       assert(Sel && "Reduction exit feeds no select");
4009       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4010 
4011       // If the target can create a predicated operator for the reduction at no
4012       // extra cost in the loop (for example a predicated vadd), it can be
4013       // cheaper for the select to remain in the loop than be sunk out of it,
4014       // and so use the select value for the phi instead of the old
4015       // LoopExitValue.
4016       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4017       if (PreferPredicatedReductionSelect ||
4018           TTI->preferPredicatedReductionSelect(
4019               RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
4020               Phi->getType(), TargetTransformInfo::ReductionFlags())) {
4021         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4022         VecRdxPhi->setIncomingValueForBlock(
4023             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4024       }
4025     }
4026   }
4027 
4028   // If the vector reduction can be performed in a smaller type, we truncate
4029   // then extend the loop exit value to enable InstCombine to evaluate the
4030   // entire expression in the smaller type.
4031   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4032     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4033     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4034     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4035     Builder.SetInsertPoint(
4036         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4037     VectorParts RdxParts(UF);
4038     for (unsigned Part = 0; Part < UF; ++Part) {
4039       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4040       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4041       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4042                                         : Builder.CreateZExt(Trunc, VecTy);
4043       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4044            UI != RdxParts[Part]->user_end();)
4045         if (*UI != Trunc) {
4046           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4047           RdxParts[Part] = Extnd;
4048         } else {
4049           ++UI;
4050         }
4051     }
4052     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4053     for (unsigned Part = 0; Part < UF; ++Part) {
4054       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4055       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4056     }
4057   }
4058 
4059   // Reduce all of the unrolled parts into a single vector.
4060   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4061   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4062 
4063   // The middle block terminator has already been assigned a DebugLoc here (the
4064   // OrigLoop's single latch terminator). We want the whole middle block to
4065   // appear to execute on this line because: (a) it is all compiler generated,
4066   // (b) these instructions are always executed after evaluating the latch
4067   // conditional branch, and (c) other passes may add new predecessors which
4068   // terminate on this line. This is the easiest way to ensure we don't
4069   // accidentally cause an extra step back into the loop while debugging.
4070   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4071   for (unsigned Part = 1; Part < UF; ++Part) {
4072     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4073     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4074       // Floating point operations had to be 'fast' to enable the reduction.
4075       ReducedPartRdx = addFastMathFlag(
4076           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4077                               ReducedPartRdx, "bin.rdx"),
4078           RdxDesc.getFastMathFlags());
4079     else
4080       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4081                                       RdxPart);
4082   }
4083 
4084   // Create the reduction after the loop. Note that inloop reductions create the
4085   // target reduction in the loop using a Reduction recipe.
4086   if (VF.isVector() && !IsInLoopReductionPhi) {
4087     bool NoNaN = Legal->hasFunNoNaNAttr();
4088     ReducedPartRdx =
4089         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4090     // If the reduction can be performed in a smaller type, we need to extend
4091     // the reduction to the wider type before we branch to the original loop.
4092     if (Phi->getType() != RdxDesc.getRecurrenceType())
4093       ReducedPartRdx =
4094         RdxDesc.isSigned()
4095         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4096         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4097   }
4098 
4099   // Create a phi node that merges control-flow from the backedge-taken check
4100   // block and the middle block.
4101   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4102                                         LoopScalarPreHeader->getTerminator());
4103   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4104     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4105   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4106 
4107   // Now, we need to fix the users of the reduction variable
4108   // inside and outside of the scalar remainder loop.
4109   // We know that the loop is in LCSSA form. We need to update the
4110   // PHI nodes in the exit blocks.
4111   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4112     // All PHINodes need to have a single entry edge, or two if
4113     // we already fixed them.
4114     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4115 
4116     // We found a reduction value exit-PHI. Update it with the
4117     // incoming bypass edge.
4118     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4119       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4120   } // end of the LCSSA phi scan.
4121 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4124   int IncomingEdgeBlockIdx =
4125     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4126   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4127   // Pick the other block.
4128   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4129   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4130   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4131 }
4132 
4133 void InnerLoopVectorizer::clearReductionWrapFlags(
4134     RecurrenceDescriptor &RdxDesc) {
4135   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4136   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4137       RK != RecurrenceDescriptor::RK_IntegerMult)
4138     return;
4139 
4140   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4141   assert(LoopExitInstr && "null loop exit instruction");
4142   SmallVector<Instruction *, 8> Worklist;
4143   SmallPtrSet<Instruction *, 8> Visited;
4144   Worklist.push_back(LoopExitInstr);
4145   Visited.insert(LoopExitInstr);
4146 
4147   while (!Worklist.empty()) {
4148     Instruction *Cur = Worklist.pop_back_val();
4149     if (isa<OverflowingBinaryOperator>(Cur))
4150       for (unsigned Part = 0; Part < UF; ++Part) {
4151         Value *V = getOrCreateVectorValue(Cur, Part);
4152         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4153       }
4154 
4155     for (User *U : Cur->users()) {
4156       Instruction *UI = cast<Instruction>(U);
4157       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4158           Visited.insert(UI).second)
4159         Worklist.push_back(UI);
4160     }
4161   }
4162 }
4163 
4164 void InnerLoopVectorizer::fixLCSSAPHIs() {
4165   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4166   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4167     if (LCSSAPhi.getNumIncomingValues() == 1) {
4168       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values are uniform, so only lane zero is
      // needed.
4170       unsigned LastLane = 0;
4171       if (isa<Instruction>(IncomingValue))
4172         LastLane = Cost->isUniformAfterVectorization(
4173                        cast<Instruction>(IncomingValue), VF)
4174                        ? 0
4175                        : VF.getKnownMinValue() - 1;
4176       // Can be a loop invariant incoming value or the last scalar value to be
4177       // extracted from the vectorized loop.
4178       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4179       Value *lastIncomingValue =
4180           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4181       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4182     }
4183   }
4184 }
4185 
4186 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4187   // The basic block and loop containing the predicated instruction.
4188   auto *PredBB = PredInst->getParent();
4189   auto *VectorLoop = LI->getLoopFor(PredBB);
4190 
4191   // Initialize a worklist with the operands of the predicated instruction.
4192   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4193 
4194   // Holds instructions that we need to analyze again. An instruction may be
4195   // reanalyzed if we don't yet know if we can sink it or not.
4196   SmallVector<Instruction *, 8> InstsToReanalyze;
4197 
4198   // Returns true if a given use occurs in the predicated block. Phi nodes use
4199   // their operands in their corresponding predecessor blocks.
4200   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4201     auto *I = cast<Instruction>(U.getUser());
4202     BasicBlock *BB = I->getParent();
4203     if (auto *Phi = dyn_cast<PHINode>(I))
4204       BB = Phi->getIncomingBlock(
4205           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4206     return BB == PredBB;
4207   };
4208 
4209   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4211   // operands are then added to the worklist. The algorithm ends after one pass
4212   // through the worklist doesn't sink a single instruction.
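  // For example, if the address computation of a predicated store (a
  // getelementptr used only by that store) was emitted outside the predicated
  // block, it is moved into the block here, and its own operands then become
  // sinking candidates on the next pass.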
4213   bool Changed;
4214   do {
4215     // Add the instructions that need to be reanalyzed to the worklist, and
4216     // reset the changed indicator.
4217     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4218     InstsToReanalyze.clear();
4219     Changed = false;
4220 
4221     while (!Worklist.empty()) {
4222       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4223 
4224       // We can't sink an instruction if it is a phi node, is already in the
4225       // predicated block, is not in the loop, or may have side effects.
4226       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4227           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4228         continue;
4229 
4230       // It's legal to sink the instruction if all its uses occur in the
4231       // predicated block. Otherwise, there's nothing to do yet, and we may
4232       // need to reanalyze the instruction.
4233       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4234         InstsToReanalyze.push_back(I);
4235         continue;
4236       }
4237 
4238       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4240       I->moveBefore(&*PredBB->getFirstInsertionPt());
4241       Worklist.insert(I->op_begin(), I->op_end());
4242 
4243       // The sinking may have enabled other instructions to be sunk, so we will
4244       // need to iterate.
4245       Changed = true;
4246     }
4247   } while (Changed);
4248 }
4249 
4250 void InnerLoopVectorizer::fixNonInductionPHIs() {
4251   for (PHINode *OrigPhi : OrigPHIsToFix) {
4252     PHINode *NewPhi =
4253         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4254     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4255 
4256     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4257         predecessors(OrigPhi->getParent()));
4258     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4259         predecessors(NewPhi->getParent()));
4260     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4261            "Scalar and Vector BB should have the same number of predecessors");
4262 
4263     // The insertion point in Builder may be invalidated by the time we get
4264     // here. Force the Builder insertion point to something valid so that we do
4265     // not run into issues during insertion point restore in
4266     // getOrCreateVectorValue calls below.
4267     Builder.SetInsertPoint(NewPhi);
4268 
4269     // The predecessor order is preserved and we can rely on mapping between
4270     // scalar and vector block predecessors.
4271     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4272       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4273 
4274       // When looking up the new scalar/vector values to fix up, use incoming
4275       // values from original phi.
4276       Value *ScIncV =
4277           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4278 
4279       // Scalar incoming value may need a broadcast
4280       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4281       NewPhi->addIncoming(NewIncV, NewPredBB);
4282     }
4283   }
4284 }
4285 
4286 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4287                                    unsigned UF, ElementCount VF,
4288                                    bool IsPtrLoopInvariant,
4289                                    SmallBitVector &IsIndexLoopInvariant,
4290                                    VPTransformState &State) {
4291   // Construct a vector GEP by widening the operands of the scalar GEP as
4292   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4293   // results in a vector of pointers when at least one operand of the GEP
4294   // is vector-typed. Thus, to keep the representation compact, we only use
4295   // vector-typed operands for loop-varying values.
4296 
4297   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4298     // If we are vectorizing, but the GEP has only loop-invariant operands,
4299     // the GEP we build (by only using vector-typed operands for
4300     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4301     // produce a vector of pointers, we need to either arbitrarily pick an
4302     // operand to broadcast, or broadcast a clone of the original GEP.
4303     // Here, we broadcast a clone of the original.
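    // For example, a GEP whose base pointer and indices are all
    // loop-invariant is cloned once and the resulting scalar pointer is
    // splatted into a vector of VF identical pointers for each unroll part.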
4304     //
4305     // TODO: If at some point we decide to scalarize instructions having
4306     //       loop-invariant operands, this special case will no longer be
4307     //       required. We would add the scalarization decision to
4308     //       collectLoopScalars() and teach getVectorValue() to broadcast
4309     //       the lane-zero scalar value.
4310     auto *Clone = Builder.Insert(GEP->clone());
4311     for (unsigned Part = 0; Part < UF; ++Part) {
4312       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4313       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4314       addMetadata(EntryPart, GEP);
4315     }
4316   } else {
4317     // If the GEP has at least one loop-varying operand, we are sure to
4318     // produce a vector of pointers. But if we are only unrolling, we want
4319     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4320     // produce with the code below will be scalar (if VF == 1) or vector
4321     // (otherwise). Note that for the unroll-only case, we still maintain
4322     // values in the vector mapping with initVector, as we do for other
4323     // instructions.
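    // For example, if one index is the widened induction
    // <i, i+1, ..., i+VF-1>, the GEP built below yields a vector of VF
    // pointers, one per lane.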
4324     for (unsigned Part = 0; Part < UF; ++Part) {
4325       // The pointer operand of the new GEP. If it's loop-invariant, we
4326       // won't broadcast it.
4327       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4328                                      : State.get(Operands.getOperand(0), Part);
4329 
4330       // Collect all the indices for the new GEP. If any index is
4331       // loop-invariant, we won't broadcast it.
4332       SmallVector<Value *, 4> Indices;
4333       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4334         VPValue *Operand = Operands.getOperand(I);
4335         if (IsIndexLoopInvariant[I - 1])
4336           Indices.push_back(State.get(Operand, {0, 0}));
4337         else
4338           Indices.push_back(State.get(Operand, Part));
4339       }
4340 
4341       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4342       // but it should be a vector, otherwise.
4343       auto *NewGEP =
4344           GEP->isInBounds()
4345               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4346                                           Indices)
4347               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4348       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4349              "NewGEP is not a pointer vector");
4350       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4351       addMetadata(NewGEP, GEP);
4352     }
4353   }
4354 }
4355 
4356 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4357                                               ElementCount VF) {
4358   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4359   PHINode *P = cast<PHINode>(PN);
4360   if (EnableVPlanNativePath) {
4361     // Currently we enter here in the VPlan-native path for non-induction
4362     // PHIs where all control flow is uniform. We simply widen these PHIs.
4363     // Create a vector phi with no operands - the vector phi operands will be
4364     // set at the end of vector code generation.
4365     Type *VecTy =
4366         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4367     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4368     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4369     OrigPHIsToFix.push_back(P);
4370 
4371     return;
4372   }
4373 
4374   assert(PN->getParent() == OrigLoop->getHeader() &&
4375          "Non-header phis should have been handled elsewhere");
4376 
4377   // In order to support recurrences we need to be able to vectorize Phi nodes.
4378   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4379   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4380   // this value when we vectorize all of the instructions that use the PHI.
4381   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4382     for (unsigned Part = 0; Part < UF; ++Part) {
4383       // This is phase one of vectorizing PHIs.
4384       bool ScalarPHI =
4385           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4386       Type *VecTy =
4387           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4388       Value *EntryPart = PHINode::Create(
4389           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4390       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4391     }
4392     return;
4393   }
4394 
4395   setDebugLocFromInst(Builder, P);
4396 
4397   // This PHINode must be an induction variable.
4398   // Make sure that we know about it.
4399   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4400 
4401   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4402   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4403 
4404   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4405   // which can be found from the original scalar operations.
4406   switch (II.getKind()) {
4407   case InductionDescriptor::IK_NoInduction:
4408     llvm_unreachable("Unknown induction");
4409   case InductionDescriptor::IK_IntInduction:
4410   case InductionDescriptor::IK_FpInduction:
4411     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4412   case InductionDescriptor::IK_PtrInduction: {
4413     // Handle the pointer induction variable case.
4414     assert(P->getType()->isPointerTy() && "Unexpected type.");
4415 
4416     if (Cost->isScalarAfterVectorization(P, VF)) {
4417       // This is the normalized GEP that starts counting at zero.
4418       Value *PtrInd =
4419           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4420       // Determine the number of scalars we need to generate for each unroll
4421       // iteration. If the instruction is uniform, we only need to generate the
4422       // first lane. Otherwise, we generate all VF values.
4423       unsigned Lanes =
4424           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4425       for (unsigned Part = 0; Part < UF; ++Part) {
4426         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4427           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4428                                            Lane + Part * VF.getKnownMinValue());
4429           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4430           Value *SclrGep =
4431               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4432           SclrGep->setName("next.gep");
4433           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4434         }
4435       }
4436       return;
4437     }
4438     assert(isa<SCEVConstant>(II.getStep()) &&
4439            "Induction step not a SCEV constant!");
4440     Type *PhiType = II.getStep()->getType();
4441 
4442     // Build a pointer phi
4443     Value *ScalarStartValue = II.getStartValue();
4444     Type *ScStValueType = ScalarStartValue->getType();
4445     PHINode *NewPointerPhi =
4446         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4447     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4448 
4449     // A pointer induction, performed by using a gep
4450     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4451     Instruction *InductionLoc = LoopLatch->getTerminator();
4452     const SCEV *ScalarStep = II.getStep();
4453     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4454     Value *ScalarStepValue =
4455         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4456     Value *InductionGEP = GetElementPtrInst::Create(
4457         ScStValueType->getPointerElementType(), NewPointerPhi,
4458         Builder.CreateMul(
4459             ScalarStepValue,
4460             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4461         "ptr.ind", InductionLoc);
4462     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4463 
4464     // Create UF many actual address geps that use the pointer
4465     // phi as base and a vectorized version of the step value
4466     // (<step*0, ..., step*N>) as offset.
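    // For example, with VF = 4 and UF = 2, part 0 uses the offsets
    // <0, 1, 2, 3> * Step and part 1 uses <4, 5, 6, 7> * Step.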
4467     for (unsigned Part = 0; Part < UF; ++Part) {
4468       SmallVector<Constant *, 8> Indices;
4469       // Create a vector of consecutive numbers from zero to VF.
4470       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4471         Indices.push_back(
4472             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4473       Constant *StartOffset = ConstantVector::get(Indices);
4474 
4475       Value *GEP = Builder.CreateGEP(
4476           ScStValueType->getPointerElementType(), NewPointerPhi,
4477           Builder.CreateMul(
4478               StartOffset,
4479               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4480               "vector.gep"));
4481       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4482     }
4483   }
4484   }
4485 }
4486 
4487 /// A helper function for checking whether an integer division-related
4488 /// instruction may divide by zero (in which case it must be predicated if
4489 /// executed conditionally in the scalar code).
4490 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4491 /// Non-zero divisors that are non compile-time constants will not be
4492 /// converted into multiplication, so we will still end up scalarizing
4493 /// the division, but can do so w/o predication.
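/// For example, 'udiv %x, 7' cannot divide by zero and needs no predication,
/// whereas 'udiv %x, %y' with a non-constant divisor is conservatively
/// reported as possibly dividing by zero.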
4494 static bool mayDivideByZero(Instruction &I) {
4495   assert((I.getOpcode() == Instruction::UDiv ||
4496           I.getOpcode() == Instruction::SDiv ||
4497           I.getOpcode() == Instruction::URem ||
4498           I.getOpcode() == Instruction::SRem) &&
4499          "Unexpected instruction");
4500   Value *Divisor = I.getOperand(1);
4501   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4502   return !CInt || CInt->isZero();
4503 }
4504 
4505 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4506                                            VPTransformState &State) {
4507   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4508   switch (I.getOpcode()) {
4509   case Instruction::Call:
4510   case Instruction::Br:
4511   case Instruction::PHI:
4512   case Instruction::GetElementPtr:
4513   case Instruction::Select:
4514     llvm_unreachable("This instruction is handled by a different recipe.");
4515   case Instruction::UDiv:
4516   case Instruction::SDiv:
4517   case Instruction::SRem:
4518   case Instruction::URem:
4519   case Instruction::Add:
4520   case Instruction::FAdd:
4521   case Instruction::Sub:
4522   case Instruction::FSub:
4523   case Instruction::FNeg:
4524   case Instruction::Mul:
4525   case Instruction::FMul:
4526   case Instruction::FDiv:
4527   case Instruction::FRem:
4528   case Instruction::Shl:
4529   case Instruction::LShr:
4530   case Instruction::AShr:
4531   case Instruction::And:
4532   case Instruction::Or:
4533   case Instruction::Xor: {
4534     // Just widen unops and binops.
4535     setDebugLocFromInst(Builder, &I);
4536 
4537     for (unsigned Part = 0; Part < UF; ++Part) {
4538       SmallVector<Value *, 2> Ops;
4539       for (VPValue *VPOp : User.operands())
4540         Ops.push_back(State.get(VPOp, Part));
4541 
4542       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4543 
4544       if (auto *VecOp = dyn_cast<Instruction>(V))
4545         VecOp->copyIRFlags(&I);
4546 
4547       // Use this vector value for all users of the original instruction.
4548       VectorLoopValueMap.setVectorValue(&I, Part, V);
4549       addMetadata(V, &I);
4550     }
4551 
4552     break;
4553   }
4554   case Instruction::ICmp:
4555   case Instruction::FCmp: {
4556     // Widen compares. Generate vector compares.
4557     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4558     auto *Cmp = cast<CmpInst>(&I);
4559     setDebugLocFromInst(Builder, Cmp);
4560     for (unsigned Part = 0; Part < UF; ++Part) {
4561       Value *A = State.get(User.getOperand(0), Part);
4562       Value *B = State.get(User.getOperand(1), Part);
4563       Value *C = nullptr;
4564       if (FCmp) {
4565         // Propagate fast math flags.
4566         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4567         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4568         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4569       } else {
4570         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4571       }
4572       VectorLoopValueMap.setVectorValue(&I, Part, C);
4573       addMetadata(C, &I);
4574     }
4575 
4576     break;
4577   }
4578 
4579   case Instruction::ZExt:
4580   case Instruction::SExt:
4581   case Instruction::FPToUI:
4582   case Instruction::FPToSI:
4583   case Instruction::FPExt:
4584   case Instruction::PtrToInt:
4585   case Instruction::IntToPtr:
4586   case Instruction::SIToFP:
4587   case Instruction::UIToFP:
4588   case Instruction::Trunc:
4589   case Instruction::FPTrunc:
4590   case Instruction::BitCast: {
4591     auto *CI = cast<CastInst>(&I);
4592     setDebugLocFromInst(Builder, CI);
4593 
    // Vectorize casts.
4595     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4596     Type *DestTy =
4597         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4598 
4599     for (unsigned Part = 0; Part < UF; ++Part) {
4600       Value *A = State.get(User.getOperand(0), Part);
4601       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4602       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4603       addMetadata(Cast, &I);
4604     }
4605     break;
4606   }
4607   default:
4608     // This instruction is not vectorized by simple widening.
4609     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4610     llvm_unreachable("Unhandled instruction!");
4611   } // end of switch.
4612 }
4613 
4614 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4615                                                VPTransformState &State) {
4616   assert(!isa<DbgInfoIntrinsic>(I) &&
4617          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4618   setDebugLocFromInst(Builder, &I);
4619 
4620   Module *M = I.getParent()->getParent()->getParent();
4621   auto *CI = cast<CallInst>(&I);
4622 
4623   SmallVector<Type *, 4> Tys;
4624   for (Value *ArgOperand : CI->arg_operands())
4625     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4626 
4627   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4628 
  // This flag shows whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call is
  // more beneficial than the library call.
4632   bool NeedToScalarize = false;
4633   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4634   bool UseVectorIntrinsic =
4635       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4636   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4637          "Instruction should be scalarized elsewhere.");
4638 
4639   for (unsigned Part = 0; Part < UF; ++Part) {
4640     SmallVector<Value *, 4> Args;
4641     for (auto &I : enumerate(ArgOperands.operands())) {
4642       // Some intrinsics have a scalar argument - don't replace it with a
4643       // vector.
4644       Value *Arg;
4645       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4646         Arg = State.get(I.value(), Part);
4647       else
4648         Arg = State.get(I.value(), {0, 0});
4649       Args.push_back(Arg);
4650     }
4651 
4652     Function *VectorF;
4653     if (UseVectorIntrinsic) {
4654       // Use vector version of the intrinsic.
4655       Type *TysForDecl[] = {CI->getType()};
4656       if (VF.isVector()) {
4657         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4658         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4659       }
4660       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4661       assert(VectorF && "Can't retrieve vector intrinsic.");
4662     } else {
4663       // Use vector version of the function call.
4664       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4665 #ifndef NDEBUG
4666       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4667              "Can't create vector function.");
4668 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4680   }
4681 }
4682 
4683 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4684                                                  VPUser &Operands,
4685                                                  bool InvariantCond,
4686                                                  VPTransformState &State) {
4687   setDebugLocFromInst(Builder, &I);
4688 
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // InstCombine will make this a no-op.
4693   auto *InvarCond =
4694       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4695 
4696   for (unsigned Part = 0; Part < UF; ++Part) {
4697     Value *Cond =
4698         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4699     Value *Op0 = State.get(Operands.getOperand(1), Part);
4700     Value *Op1 = State.get(Operands.getOperand(2), Part);
4701     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4702     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4703     addMetadata(Sel, &I);
4704   }
4705 }
4706 
4707 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4708   // We should not collect Scalars more than once per VF. Right now, this
4709   // function is called from collectUniformsAndScalars(), which already does
4710   // this check. Collecting Scalars for VF=1 does not make any sense.
4711   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4712          "This function should not be visited twice for the same VF");
4713 
4714   SmallSetVector<Instruction *, 8> Worklist;
4715 
4716   // These sets are used to seed the analysis with pointers used by memory
4717   // accesses that will remain scalar.
4718   SmallSetVector<Instruction *, 8> ScalarPtrs;
4719   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4720   auto *Latch = TheLoop->getLoopLatch();
4721 
4722   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4723   // The pointer operands of loads and stores will be scalar as long as the
4724   // memory access is not a gather or scatter operation. The value operand of a
4725   // store will remain scalar if the store is scalarized.
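  // For example, the pointer operand of a load that will become a contiguous
  // (or interleaved) wide load stays scalar, whereas the pointer of a load
  // widened into a gather does not.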
4726   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4727     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4728     assert(WideningDecision != CM_Unknown &&
4729            "Widening decision should be ready at this moment");
4730     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4731       if (Ptr == Store->getValueOperand())
4732         return WideningDecision == CM_Scalarize;
4733     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4734            "Ptr is neither a value or pointer operand");
4735     return WideningDecision != CM_GatherScatter;
4736   };
4737 
4738   // A helper that returns true if the given value is a bitcast or
4739   // getelementptr instruction contained in the loop.
4740   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4741     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4742             isa<GetElementPtrInst>(V)) &&
4743            !TheLoop->isLoopInvariant(V);
4744   };
4745 
4746   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4747     if (!isa<PHINode>(Ptr) ||
4748         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4749       return false;
4750     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4751     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4752       return false;
4753     return isScalarUse(MemAccess, Ptr);
4754   };
4755 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted into
  // the Worklist. If the use will be a scalar use, and the pointer is only
  // used by memory accesses, we place the pointer in ScalarPtrs. Otherwise,
  // the pointer is placed in PossibleNonScalarPtrs.
4761   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4762     if (isScalarPtrInduction(MemAccess, Ptr)) {
4763       Worklist.insert(cast<Instruction>(Ptr));
4764       Instruction *Update = cast<Instruction>(
4765           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4766       Worklist.insert(Update);
4767       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4768                         << "\n");
4769       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4770                         << "\n");
4771       return;
4772     }
4773     // We only care about bitcast and getelementptr instructions contained in
4774     // the loop.
4775     if (!isLoopVaryingBitCastOrGEP(Ptr))
4776       return;
4777 
4778     // If the pointer has already been identified as scalar (e.g., if it was
4779     // also identified as uniform), there's nothing to do.
4780     auto *I = cast<Instruction>(Ptr);
4781     if (Worklist.count(I))
4782       return;
4783 
4784     // If the use of the pointer will be a scalar use, and all users of the
4785     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4786     // place the pointer in PossibleNonScalarPtrs.
4787     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4788           return isa<LoadInst>(U) || isa<StoreInst>(U);
4789         }))
4790       ScalarPtrs.insert(I);
4791     else
4792       PossibleNonScalarPtrs.insert(I);
4793   };
4794 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4799   //
4800   // (1) Add to the worklist all instructions that have been identified as
4801   // uniform-after-vectorization.
4802   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4803 
4804   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4805   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4807   // scatter operation. The value operand of a store will remain scalar if the
4808   // store is scalarized.
4809   for (auto *BB : TheLoop->blocks())
4810     for (auto &I : *BB) {
4811       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4812         evaluatePtrUse(Load, Load->getPointerOperand());
4813       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4814         evaluatePtrUse(Store, Store->getPointerOperand());
4815         evaluatePtrUse(Store, Store->getValueOperand());
4816       }
4817     }
4818   for (auto *I : ScalarPtrs)
4819     if (!PossibleNonScalarPtrs.count(I)) {
4820       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4821       Worklist.insert(I);
4822     }
4823 
4824   // Insert the forced scalars.
4825   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4826   // induction variable when the PHI user is scalarized.
4827   auto ForcedScalar = ForcedScalars.find(VF);
4828   if (ForcedScalar != ForcedScalars.end())
4829     for (auto *I : ForcedScalar->second)
4830       Worklist.insert(I);
4831 
4832   // Expand the worklist by looking through any bitcasts and getelementptr
4833   // instructions we've already identified as scalar. This is similar to the
4834   // expansion step in collectLoopUniforms(); however, here we're only
4835   // expanding to include additional bitcasts and getelementptr instructions.
4836   unsigned Idx = 0;
4837   while (Idx != Worklist.size()) {
4838     Instruction *Dst = Worklist[Idx++];
4839     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4840       continue;
4841     auto *Src = cast<Instruction>(Dst->getOperand(0));
4842     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4843           auto *J = cast<Instruction>(U);
4844           return !TheLoop->contains(J) || Worklist.count(J) ||
4845                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4846                   isScalarUse(J, Src));
4847         })) {
4848       Worklist.insert(Src);
4849       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4850     }
4851   }
4852 
4853   // An induction variable will remain scalar if all users of the induction
4854   // variable and induction variable update remain scalar.
4855   for (auto &Induction : Legal->getInductionVars()) {
4856     auto *Ind = Induction.first;
4857     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4858 
4859     // If tail-folding is applied, the primary induction variable will be used
4860     // to feed a vector compare.
4861     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4862       continue;
4863 
4864     // Determine if all users of the induction variable are scalar after
4865     // vectorization.
4866     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4867       auto *I = cast<Instruction>(U);
4868       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4869     });
4870     if (!ScalarInd)
4871       continue;
4872 
4873     // Determine if all users of the induction variable update instruction are
4874     // scalar after vectorization.
4875     auto ScalarIndUpdate =
4876         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4877           auto *I = cast<Instruction>(U);
4878           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4879         });
4880     if (!ScalarIndUpdate)
4881       continue;
4882 
4883     // The induction variable and its update instruction will remain scalar.
4884     Worklist.insert(Ind);
4885     Worklist.insert(IndUpdate);
4886     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4887     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4888                       << "\n");
4889   }
4890 
4891   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4892 }
4893 
4894 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4895                                                          ElementCount VF) {
4896   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4897   if (!blockNeedsPredication(I->getParent()))
4898     return false;
  switch (I->getOpcode()) {
4900   default:
4901     break;
4902   case Instruction::Load:
4903   case Instruction::Store: {
4904     if (!Legal->isMaskRequired(I))
4905       return false;
4906     auto *Ptr = getLoadStorePointerOperand(I);
4907     auto *Ty = getMemInstValueType(I);
4908     // We have already decided how to vectorize this instruction, get that
4909     // result.
4910     if (VF.isVector()) {
4911       InstWidening WideningDecision = getWideningDecision(I, VF);
4912       assert(WideningDecision != CM_Unknown &&
4913              "Widening decision should be ready at this moment");
4914       return WideningDecision == CM_Scalarize;
4915     }
4916     const Align Alignment = getLoadStoreAlignment(I);
4917     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4918                                 isLegalMaskedGather(Ty, Alignment))
4919                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4920                                 isLegalMaskedScatter(Ty, Alignment));
4921   }
4922   case Instruction::UDiv:
4923   case Instruction::SDiv:
4924   case Instruction::SRem:
4925   case Instruction::URem:
4926     return mayDivideByZero(*I);
4927   }
4928   return false;
4929 }
4930 
4931 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4932     Instruction *I, ElementCount VF) {
4933   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4934   assert(getWideningDecision(I, VF) == CM_Unknown &&
4935          "Decision should not be set yet.");
4936   auto *Group = getInterleavedAccessGroup(I);
4937   assert(Group && "Must have a group.");
4938 
  // If the instruction's allocated size doesn't equal its type size, it
4940   // requires padding and will be scalarized.
4941   auto &DL = I->getModule()->getDataLayout();
4942   auto *ScalarTy = getMemInstValueType(I);
4943   if (hasIrregularType(ScalarTy, DL, VF))
4944     return false;
4945 
4946   // Check if masking is required.
4947   // A Group may need masking for one of two reasons: it resides in a block that
4948   // needs predication, or it was decided to use masking to deal with gaps.
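  // For example (illustrative only): a two-member group accessing A[2*i] and
  // A[2*i+1] inside an if-converted block falls under the first case, while a
  // group with a gap in its trailing members, used when no scalar epilogue is
  // allowed, falls under the second.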
4949   bool PredicatedAccessRequiresMasking =
4950       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4951   bool AccessWithGapsRequiresMasking =
4952       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4953   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4954     return true;
4955 
4956   // If masked interleaving is required, we expect that the user/target had
4957   // enabled it, because otherwise it either wouldn't have been created or
4958   // it should have been invalidated by the CostModel.
4959   assert(useMaskedInterleavedAccesses(TTI) &&
4960          "Masked interleave-groups for predicated accesses are not enabled.");
4961 
4962   auto *Ty = getMemInstValueType(I);
4963   const Align Alignment = getLoadStoreAlignment(I);
4964   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4965                           : TTI.isLegalMaskedStore(Ty, Alignment);
4966 }
4967 
4968 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4969     Instruction *I, ElementCount VF) {
4970   // Get and ensure we have a valid memory instruction.
4971   LoadInst *LI = dyn_cast<LoadInst>(I);
4972   StoreInst *SI = dyn_cast<StoreInst>(I);
4973   assert((LI || SI) && "Invalid memory instruction");
4974 
4975   auto *Ptr = getLoadStorePointerOperand(I);
4976 
  // First of all, in order to be widened, the pointer should be consecutive.
4978   if (!Legal->isConsecutivePtr(Ptr))
4979     return false;
4980 
4981   // If the instruction is a store located in a predicated block, it will be
4982   // scalarized.
4983   if (isScalarWithPredication(I))
4984     return false;
4985 
  // If the instruction's allocated size doesn't equal its type size, it
4987   // requires padding and will be scalarized.
4988   auto &DL = I->getModule()->getDataLayout();
4989   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4990   if (hasIrregularType(ScalarTy, DL, VF))
4991     return false;
4992 
4993   return true;
4994 }
4995 
4996 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4997   // We should not collect Uniforms more than once per VF. Right now,
4998   // this function is called from collectUniformsAndScalars(), which
4999   // already does this check. Collecting Uniforms for VF=1 does not make any
5000   // sense.
5001 
5002   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5003          "This function should not be visited twice for the same VF");
5004 
  // Initialize the entry for this VF. Even if we do not find any uniform
  // value, Uniforms.count(VF) will then return 1, so we will not analyze
  // this VF again.
5007   Uniforms[VF].clear();
5008 
5009   // We now know that the loop is vectorizable!
5010   // Collect instructions inside the loop that will remain uniform after
5011   // vectorization.
5012 
  // Global values, parameters and instructions outside of the current loop
  // are out of scope.
5015   auto isOutOfScope = [&](Value *V) -> bool {
5016     Instruction *I = dyn_cast<Instruction>(V);
5017     return (!I || !TheLoop->contains(I));
5018   };
5019 
5020   SetVector<Instruction *> Worklist;
5021   BasicBlock *Latch = TheLoop->getLoopLatch();
5022 
5023   // Instructions that are scalar with predication must not be considered
5024   // uniform after vectorization, because that would create an erroneous
5025   // replicating region where only a single instance out of VF should be formed.
5026   // TODO: optimize such seldom cases if found important, see PR40816.
5027   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5028     if (isScalarWithPredication(I, VF)) {
5029       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5030                         << *I << "\n");
5031       return;
5032     }
5033     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5034     Worklist.insert(I);
5035   };
5036 
5037   // Start with the conditional branch. If the branch condition is an
5038   // instruction contained in the loop that is only used by the branch, it is
5039   // uniform.
5040   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5041   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5042     addToWorklistIfAllowed(Cmp);
5043 
5044   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5045   // are pointers that are treated like consecutive pointers during
5046   // vectorization. The pointer operands of interleaved accesses are an
5047   // example.
5048   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5049 
5050   // Holds pointer operands of instructions that are possibly non-uniform.
5051   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5052 
5053   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5054     InstWidening WideningDecision = getWideningDecision(I, VF);
5055     assert(WideningDecision != CM_Unknown &&
5056            "Widening decision should be ready at this moment");
5057 
5058     return (WideningDecision == CM_Widen ||
5059             WideningDecision == CM_Widen_Reverse ||
5060             WideningDecision == CM_Interleave);
5061   };
5062   // Iterate over the instructions in the loop, and collect all
5063   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5064   // that a consecutive-like pointer operand will be scalarized, we collect it
5065   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5066   // getelementptr instruction can be used by both vectorized and scalarized
5067   // memory instructions. For example, if a loop loads and stores from the same
5068   // location, but the store is conditional, the store will be scalarized, and
5069   // the getelementptr won't remain uniform.
5070   for (auto *BB : TheLoop->blocks())
5071     for (auto &I : *BB) {
5072       // If there's no pointer operand, there's nothing to do.
5073       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5074       if (!Ptr)
5075         continue;
5076 
5077       // True if all users of Ptr are memory accesses that have Ptr as their
5078       // pointer operand.
5079       auto UsersAreMemAccesses =
5080           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5081             return getLoadStorePointerOperand(U) == Ptr;
5082           });
5083 
5084       // Ensure the memory instruction will not be scalarized or used by
5085       // gather/scatter, making its pointer operand non-uniform. If the pointer
5086       // operand is used by any instruction other than a memory access, we
5087       // conservatively assume the pointer operand may be non-uniform.
5088       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5089         PossibleNonUniformPtrs.insert(Ptr);
5090 
      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like or part of an interleaved access, the pointer
      // operand should remain uniform.
5094       else
5095         ConsecutiveLikePtrs.insert(Ptr);
5096     }
5097 
5098   // Add to the Worklist all consecutive and consecutive-like pointers that
5099   // aren't also identified as possibly non-uniform.
5100   for (auto *V : ConsecutiveLikePtrs)
5101     if (!PossibleNonUniformPtrs.count(V))
5102       addToWorklistIfAllowed(V);
5103 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5107   unsigned idx = 0;
5108   while (idx != Worklist.size()) {
5109     Instruction *I = Worklist[idx++];
5110 
5111     for (auto OV : I->operand_values()) {
5112       // isOutOfScope operands cannot be uniform instructions.
5113       if (isOutOfScope(OV))
5114         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5117       auto *OP = dyn_cast<PHINode>(OV);
5118       if (OP && Legal->isFirstOrderRecurrence(OP))
5119         continue;
5120       // If all the users of the operand are uniform, then add the
5121       // operand into the uniform worklist.
5122       auto *OI = cast<Instruction>(OV);
5123       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5124             auto *J = cast<Instruction>(U);
5125             return Worklist.count(J) ||
5126                    (OI == getLoadStorePointerOperand(J) &&
5127                     isUniformDecision(J, VF));
5128           }))
5129         addToWorklistIfAllowed(OI);
5130     }
5131   }
5132 
5133   // Returns true if Ptr is the pointer operand of a memory access instruction
5134   // I, and I is known to not require scalarization.
5135   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5136     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5137   };
5138 
5139   // For an instruction to be added into Worklist above, all its users inside
5140   // the loop should also be in Worklist. However, this condition cannot be
5141   // true for phi nodes that form a cyclic dependence. We must process phi
5142   // nodes separately. An induction variable will remain uniform if all users
5143   // of the induction variable and induction variable update remain uniform.
5144   // The code below handles both pointer and non-pointer induction variables.
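  // For example (illustrative IR):
  //   %i      = phi i64 [ 0, %preheader ], [ %i.next, %latch ]
  //   %i.next = add nuw nsw i64 %i, 1
  // If each in-loop user of %i and %i.next is the other of the pair, already
  // uniform, or a widened memory access using it as a pointer operand, both
  // the phi and its update remain uniform after vectorization.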
5145   for (auto &Induction : Legal->getInductionVars()) {
5146     auto *Ind = Induction.first;
5147     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5148 
5149     // Determine if all users of the induction variable are uniform after
5150     // vectorization.
5151     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5152       auto *I = cast<Instruction>(U);
5153       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5154              isVectorizedMemAccessUse(I, Ind);
5155     });
5156     if (!UniformInd)
5157       continue;
5158 
5159     // Determine if all users of the induction variable update instruction are
5160     // uniform after vectorization.
5161     auto UniformIndUpdate =
5162         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5163           auto *I = cast<Instruction>(U);
5164           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5165                  isVectorizedMemAccessUse(I, IndUpdate);
5166         });
5167     if (!UniformIndUpdate)
5168       continue;
5169 
5170     // The induction variable and its update instruction will remain uniform.
5171     addToWorklistIfAllowed(Ind);
5172     addToWorklistIfAllowed(IndUpdate);
5173   }
5174 
5175   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5176 }
5177 
5178 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5179   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5180 
5181   if (Legal->getRuntimePointerChecking()->Need) {
5182     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5183         "runtime pointer checks needed. Enable vectorization of this "
5184         "loop with '#pragma clang loop vectorize(enable)' when "
5185         "compiling with -Os/-Oz",
5186         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5187     return true;
5188   }
5189 
5190   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5191     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5192         "runtime SCEV checks needed. Enable vectorization of this "
5193         "loop with '#pragma clang loop vectorize(enable)' when "
5194         "compiling with -Os/-Oz",
5195         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5196     return true;
5197   }
5198 
5199   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5200   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5201     reportVectorizationFailure("Runtime stride check for small trip count",
5202         "runtime stride == 1 checks needed. Enable vectorization of "
5203         "this loop without such check by compiling with -Os/-Oz",
5204         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5205     return true;
5206   }
5207 
5208   return false;
5209 }
5210 
5211 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5212                                                             unsigned UserIC) {
5213   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5216     reportVectorizationFailure(
5217         "Not inserting runtime ptr check for divergent target",
5218         "runtime pointer checks needed. Not enabled for divergent target",
5219         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5220     return None;
5221   }
5222 
5223   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5224   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5225   if (TC == 1) {
5226     reportVectorizationFailure("Single iteration (non) loop",
5227         "loop trip count is one, irrelevant for vectorization",
5228         "SingleIterationLoop", ORE, TheLoop);
5229     return None;
5230   }
5231 
5232   switch (ScalarEpilogueStatus) {
5233   case CM_ScalarEpilogueAllowed:
5234     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5235   case CM_ScalarEpilogueNotNeededUsePredicate:
5236     LLVM_DEBUG(
5237         dbgs() << "LV: vector predicate hint/switch found.\n"
5238                << "LV: Not allowing scalar epilogue, creating predicated "
5239                << "vector loop.\n");
5240     break;
5241   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5242     // fallthrough as a special case of OptForSize
5243   case CM_ScalarEpilogueNotAllowedOptSize:
5244     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5245       LLVM_DEBUG(
5246           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5247     else
5248       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5249                         << "count.\n");
5250 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
5253     if (runtimeChecksRequired())
5254       return None;
5255     break;
5256   }
5257 
  // Now try to fold the loop tail by masking.
5259 
5260   // Invalidate interleave groups that require an epilogue if we can't mask
5261   // the interleave-group.
5262   if (!useMaskedInterleavedAccesses(TTI)) {
5263     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5264            "No decisions should have been taken at this point");
5265     // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5267     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5268   }
5269 
5270   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5271   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5272   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5273   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5274     // Accept MaxVF if we do not have a tail.
5275     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5276     return MaxVF;
5277   }
5278 
5279   // If we don't know the precise trip count, or if the trip count that we
5280   // found modulo the vectorization factor is not zero, try to fold the tail
5281   // by masking.
5282   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5283   if (Legal->prepareToFoldTailByMasking()) {
5284     FoldTailByMasking = true;
5285     return MaxVF;
5286   }
5287 
5288   // If there was a tail-folding hint/switch, but we can't fold the tail by
5289   // masking, fallback to a vectorization with a scalar epilogue.
5290   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5291     if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5292       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5293       return None;
5294     }
5295     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5296                          "scalar epilogue instead.\n");
5297     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5298     return MaxVF;
5299   }
5300 
5301   if (TC == 0) {
5302     reportVectorizationFailure(
5303         "Unable to calculate the loop count due to complex control flow",
5304         "unable to calculate the loop count due to complex control flow",
5305         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5306     return None;
5307   }
5308 
5309   reportVectorizationFailure(
5310       "Cannot optimize for size and vectorize at the same time.",
5311       "cannot optimize for size and vectorize at the same time. "
5312       "Enable vectorization of this loop with '#pragma clang loop "
5313       "vectorize(enable)' when compiling with -Os/-Oz",
5314       "NoTailLoopWithOptForSize", ORE, TheLoop);
5315   return None;
5316 }
5317 
5318 unsigned
5319 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5320   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5321   unsigned SmallestType, WidestType;
5322   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5323   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5324 
5325   // Get the maximum safe dependence distance in bits computed by LAA.
5326   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5328   // dependence distance).
5329   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5330 
5331   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5332 
5333   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5335   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
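  // For example (purely illustrative numbers): with 256-bit vector registers,
  // a widest element type of 32 bits and no stricter dependence bound,
  // MaxVectorSize = PowerOf2Floor(256 / 32) = 8; a 96-bit safe dependence
  // width would instead clamp this to PowerOf2Floor(96 / 32) = 2.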
5336 
5337   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5338                     << " / " << WidestType << " bits.\n");
5339   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5340                     << WidestRegister << " bits.\n");
5341 
5342   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5343                                  " into one vector!");
5344   if (MaxVectorSize == 0) {
5345     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5346     MaxVectorSize = 1;
5347     return MaxVectorSize;
5348   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5349              isPowerOf2_32(ConstTripCount)) {
5350     // We need to clamp the VF to be the ConstTripCount. There is no point in
5351     // choosing a higher viable VF as done in the loop below.
5352     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5353                       << ConstTripCount << "\n");
5354     MaxVectorSize = ConstTripCount;
5355     return MaxVectorSize;
5356   }
5357 
5358   unsigned MaxVF = MaxVectorSize;
5359   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5360       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5361     // Collect all viable vectorization factors larger than the default MaxVF
5362     // (i.e. MaxVectorSize).
5363     SmallVector<ElementCount, 8> VFs;
5364     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5365     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5366       VFs.push_back(ElementCount::getFixed(VS));
5367 
5368     // For each VF calculate its register usage.
5369     auto RUs = calculateRegisterUsage(VFs);
5370 
5371     // Select the largest VF which doesn't require more registers than existing
5372     // ones.
5373     for (int i = RUs.size() - 1; i >= 0; --i) {
5374       bool Selected = true;
5375       for (auto& pair : RUs[i].MaxLocalUsers) {
5376         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5377         if (pair.second > TargetNumRegisters)
5378           Selected = false;
5379       }
5380       if (Selected) {
5381         MaxVF = VFs[i].getKnownMinValue();
5382         break;
5383       }
5384     }
5385     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5386       if (MaxVF < MinVF) {
5387         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5388                           << ") with target's minimum: " << MinVF << '\n');
5389         MaxVF = MinVF;
5390       }
5391     }
5392   }
5393   return MaxVF;
5394 }
5395 
5396 VectorizationFactor
5397 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5398   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5399   const float ScalarCost = Cost;
5400   unsigned Width = 1;
5401   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5402 
5403   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5404   if (ForceVectorization && MaxVF > 1) {
5405     // Ignore scalar width, because the user explicitly wants vectorization.
5406     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5407     // evaluation.
5408     Cost = std::numeric_limits<float>::max();
5409   }
5410 
5411   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5415     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5416     float VectorCost = C.first / (float)i;
5417     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5418                       << " costs: " << (int)VectorCost << ".\n");
5419     if (!C.second && !ForceVectorization) {
5420       LLVM_DEBUG(
5421           dbgs() << "LV: Not considering vector loop of width " << i
5422                  << " because it will not generate any vector instructions.\n");
5423       continue;
5424     }
5425     if (VectorCost < Cost) {
5426       Cost = VectorCost;
5427       Width = i;
5428     }
5429   }
5430 
5431   if (!EnableCondStoresVectorization && NumPredStores) {
5432     reportVectorizationFailure("There are conditional stores.",
5433         "store that is conditionally executed prevents vectorization",
5434         "ConditionalStore", ORE, TheLoop);
5435     Width = 1;
5436     Cost = ScalarCost;
5437   }
5438 
5439   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5440              << "LV: Vectorization seems to be not beneficial, "
5441              << "but was forced by a user.\n");
5442   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5443   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5444                                 (unsigned)(Width * Cost)};
5445   return Factor;
5446 }
5447 
5448 std::pair<unsigned, unsigned>
5449 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5450   unsigned MinWidth = -1U;
5451   unsigned MaxWidth = 8;
5452   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5453 
5454   // For each block.
5455   for (BasicBlock *BB : TheLoop->blocks()) {
5456     // For each instruction in the loop.
5457     for (Instruction &I : BB->instructionsWithoutDebug()) {
5458       Type *T = I.getType();
5459 
5460       // Skip ignored values.
5461       if (ValuesToIgnore.count(&I))
5462         continue;
5463 
5464       // Only examine Loads, Stores and PHINodes.
5465       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5466         continue;
5467 
5468       // Examine PHI nodes that are reduction variables. Update the type to
5469       // account for the recurrence type.
5470       if (auto *PN = dyn_cast<PHINode>(&I)) {
5471         if (!Legal->isReductionVariable(PN))
5472           continue;
5473         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5474         T = RdxDesc.getRecurrenceType();
5475       }
5476 
5477       // Examine the stored values.
5478       if (auto *ST = dyn_cast<StoreInst>(&I))
5479         T = ST->getValueOperand()->getType();
5480 
5481       // Ignore loaded pointer types and stored pointer types that are not
5482       // vectorizable.
5483       //
5484       // FIXME: The check here attempts to predict whether a load or store will
5485       //        be vectorized. We only know this for certain after a VF has
5486       //        been selected. Here, we assume that if an access can be
5487       //        vectorized, it will be. We should also look at extending this
5488       //        optimization to non-pointer types.
5489       //
5490       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5491           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5492         continue;
5493 
5494       MinWidth = std::min(MinWidth,
5495                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5496       MaxWidth = std::max(MaxWidth,
5497                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5498     }
5499   }
5500 
5501   return {MinWidth, MaxWidth};
5502 }
5503 
5504 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5505                                                            unsigned LoopCost) {
5506   // -- The interleave heuristics --
5507   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5508   // There are many micro-architectural considerations that we can't predict
5509   // at this level. For example, frontend pressure (on decode or fetch) due to
5510   // code size, or the number and capabilities of the execution ports.
5511   //
5512   // We use the following heuristics to select the interleave count:
5513   // 1. If the code has reductions, then we interleave to break the cross
5514   // iteration dependency.
5515   // 2. If the loop is really small, then we interleave to reduce the loop
5516   // overhead.
5517   // 3. We don't interleave if we think that we will spill registers to memory
5518   // due to the increased register pressure.
5519 
5520   if (!isScalarEpilogueAllowed())
5521     return 1;
5522 
  // Do not interleave if there is a maximum safe dependence distance: it was
  // already used to bound the vectorization factor.
5524   if (Legal->getMaxSafeDepDistBytes() != -1U)
5525     return 1;
5526 
5527   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5528   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5534   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5535       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5536     return 1;
5537 
5538   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so clamp them to at least one register,
  // i.e. assume at least one instruction uses at least one register.
5541   for (auto& pair : R.MaxLocalUsers) {
5542     pair.second = std::max(pair.second, 1U);
5543   }
5544 
5545   // We calculate the interleave count using the following formula.
5546   // Subtract the number of loop invariants from the number of available
5547   // registers. These registers are used by all of the interleaved instances.
5548   // Next, divide the remaining registers by the number of registers that is
5549   // required by the loop, in order to estimate how many parallel instances
5550   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations and alignment considerations.
  // We also want a power-of-two interleave count to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case
  // the interleave count is forced to 1 above.
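  // For example (purely illustrative numbers): with 32 registers in a class,
  // 2 of them holding loop-invariant values and a maximum of 6 values live at
  // once, the estimate is PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4.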
5556   unsigned IC = UINT_MAX;
5557 
5558   for (auto& pair : R.MaxLocalUsers) {
5559     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5560     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5561                       << " registers of "
5562                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5563     if (VF.isScalar()) {
5564       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5565         TargetNumRegisters = ForceTargetNumScalarRegs;
5566     } else {
5567       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5568         TargetNumRegisters = ForceTargetNumVectorRegs;
5569     }
5570     unsigned MaxLocalUsers = pair.second;
5571     unsigned LoopInvariantRegs = 0;
5572     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5573       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5574 
5575     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5576     // Don't count the induction variable as interleaved.
5577     if (EnableIndVarRegisterHeur) {
5578       TmpIC =
5579           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5580                         std::max(1U, (MaxLocalUsers - 1)));
5581     }
5582 
5583     IC = std::min(IC, TmpIC);
5584   }
5585 
5586   // Clamp the interleave ranges to reasonable counts.
5587   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5588   unsigned MaxInterleaveCount =
5589       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5590 
5591   // Check if the user has overridden the max.
5592   if (VF.isScalar()) {
5593     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5594       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5595   } else {
5596     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5597       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5598   }
5599 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to at most the trip count divided by VF.
5602   if (BestKnownTC) {
5603     MaxInterleaveCount =
5604         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5605   }
5606 
5607   // If we did not calculate the cost for VF (because the user selected the VF)
5608   // then we calculate the cost of VF here.
5609   if (LoopCost == 0)
5610     LoopCost = expectedCost(VF).first;
5611 
5612   assert(LoopCost && "Non-zero loop cost expected");
5613 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5616   if (IC > MaxInterleaveCount)
5617     IC = MaxInterleaveCount;
5618   else if (IC < 1)
5619     IC = 1;
5620 
5621   // Interleave if we vectorized this loop and there is a reduction that could
5622   // benefit from interleaving.
5623   if (VF.isVector() && HasReductions) {
5624     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5625     return IC;
5626   }
5627 
5628   // Note that if we've already vectorized the loop we will have done the
5629   // runtime check and so interleaving won't require further checks.
5630   bool InterleavingRequiresRuntimePointerCheck =
5631       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5632 
5633   // We want to interleave small loops in order to reduce the loop overhead and
5634   // potentially expose ILP opportunities.
5635   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5636                     << "LV: IC is " << IC << '\n'
5637                     << "LV: VF is " << VF.getKnownMinValue() << '\n');
5638   const bool AggressivelyInterleaveReductions =
5639       TTI.enableAggressiveInterleaving(HasReductions);
5640   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5641     // We assume that the cost overhead is 1 and we use the cost model
5642     // to estimate the cost of the loop and interleave until the cost of the
5643     // loop overhead is about 5% of the cost of the loop.
5644     unsigned SmallIC =
5645         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
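    // For example (purely illustrative numbers): with SmallLoopCost = 20 and
    // LoopCost = 3, SmallIC is at most PowerOf2Floor(20 / 3) = 4.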
5646 
5647     // Interleave until store/load ports (estimated by max interleave count) are
5648     // saturated.
5649     unsigned NumStores = Legal->getNumStores();
5650     unsigned NumLoads = Legal->getNumLoads();
5651     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5652     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5653 
5654     // If we have a scalar reduction (vector reductions are already dealt with
5655     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so the
5657     // critical path only gets increased by one reduction operation.
5658     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5659       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5660       SmallIC = std::min(SmallIC, F);
5661       StoresIC = std::min(StoresIC, F);
5662       LoadsIC = std::min(LoadsIC, F);
5663     }
5664 
5665     if (EnableLoadStoreRuntimeInterleave &&
5666         std::max(StoresIC, LoadsIC) > SmallIC) {
5667       LLVM_DEBUG(
5668           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5669       return std::max(StoresIC, LoadsIC);
5670     }
5671 
5672     // If there are scalar reductions and TTI has enabled aggressive
5673     // interleaving for reductions, we will interleave to expose ILP.
5674     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5675         AggressivelyInterleaveReductions) {
5676       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC, but not as aggressively as the normal
      // IC, to handle the rare situation when resources are too limited.
5679       return std::max(IC / 2, SmallIC);
5680     } else {
5681       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5682       return SmallIC;
5683     }
5684   }
5685 
5686   // Interleave if this is a large loop (small loops are already dealt with by
5687   // this point) that could benefit from interleaving.
5688   if (AggressivelyInterleaveReductions) {
5689     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5690     return IC;
5691   }
5692 
5693   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5694   return 1;
5695 }
5696 
5697 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5698 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5699   // This function calculates the register usage by measuring the highest number
5700   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5702   // assign a number to each instruction. We use RPO to ensure that defs are
5703   // met before their users. We assume that each instruction that has in-loop
5704   // users starts an interval. We record every time that an in-loop value is
5705   // used, so we have a list of the first and last occurrences of each
5706   // instruction. Next, we transpose this data structure into a multi map that
5707   // holds the list of intervals that *end* at a specific location. This multi
5708   // map allows us to perform a linear search. We scan the instructions linearly
5709   // and record each time that a new interval starts, by placing it in a set.
5710   // If we find this value in the multi-map then we remove it from the set.
5711   // The max register usage is the maximum size of the set.
5712   // We also search for instructions that are defined outside the loop, but are
5713   // used inside the loop. We need this number separately from the max-interval
5714   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
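  // For example (illustrative only): if instruction #0 is last used by
  // instruction #2 and instruction #1 is last used by instruction #3, then at
  // position #2 both intervals are still open, so two values are live there.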
5716   LoopBlocksDFS DFS(TheLoop);
5717   DFS.perform(LI);
5718 
5719   RegisterUsage RU;
5720 
5721   // Each 'key' in the map opens a new interval. The values
5722   // of the map are the index of the 'last seen' usage of the
5723   // instruction that is the key.
5724   using IntervalMap = DenseMap<Instruction *, unsigned>;
5725 
5726   // Maps instruction to its index.
5727   SmallVector<Instruction *, 64> IdxToInstr;
5728   // Marks the end of each interval.
5729   IntervalMap EndPoint;
  // Saves the set of instructions that have at least one use inside the loop.
5731   SmallPtrSet<Instruction *, 8> Ends;
5732   // Saves the list of values that are used in the loop but are
5733   // defined outside the loop, such as arguments and constants.
5734   SmallPtrSet<Value *, 8> LoopInvariants;
5735 
5736   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5737     for (Instruction &I : BB->instructionsWithoutDebug()) {
5738       IdxToInstr.push_back(&I);
5739 
5740       // Save the end location of each USE.
5741       for (Value *U : I.operands()) {
5742         auto *Instr = dyn_cast<Instruction>(U);
5743 
5744         // Ignore non-instruction values such as arguments, constants, etc.
5745         if (!Instr)
5746           continue;
5747 
5748         // If this instruction is outside the loop then record it and continue.
5749         if (!TheLoop->contains(Instr)) {
5750           LoopInvariants.insert(Instr);
5751           continue;
5752         }
5753 
5754         // Overwrite previous end points.
5755         EndPoint[Instr] = IdxToInstr.size();
5756         Ends.insert(Instr);
5757       }
5758     }
5759   }
5760 
5761   // Saves the list of intervals that end with the index in 'key'.
5762   using InstrList = SmallVector<Instruction *, 2>;
5763   DenseMap<unsigned, InstrList> TransposeEnds;
5764 
5765   // Transpose the EndPoints to a list of values that end at each index.
5766   for (auto &Interval : EndPoint)
5767     TransposeEnds[Interval.second].push_back(Interval.first);
5768 
5769   SmallPtrSet<Instruction *, 8> OpenIntervals;
5770 
5771   // Get the size of the widest register.
5772   unsigned MaxSafeDepDist = -1U;
5773   if (Legal->getMaxSafeDepDistBytes() != -1U)
5774     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5775   unsigned WidestRegister =
5776       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5777   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5778 
5779   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5780   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5781 
5782   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5783 
5784   // A lambda that gets the register usage for the given type and VF.
5785   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
5786     if (Ty->isTokenTy())
5787       return 0U;
5788     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5789     assert(!VF.isScalable() && "scalable vectors not yet supported.");
5790     return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize /
5791                                      WidestRegister);
5792   };
5793 
5794   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5795     Instruction *I = IdxToInstr[i];
5796 
5797     // Remove all of the instructions that end at this location.
5798     InstrList &List = TransposeEnds[i];
5799     for (Instruction *ToRemove : List)
5800       OpenIntervals.erase(ToRemove);
5801 
5802     // Ignore instructions that are never used within the loop.
5803     if (!Ends.count(I))
5804       continue;
5805 
5806     // Skip ignored values.
5807     if (ValuesToIgnore.count(I))
5808       continue;
5809 
5810     // For each VF find the maximum usage of registers.
5811     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5812       // Count the number of live intervals.
5813       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5814 
5815       if (VFs[j].isScalar()) {
5816         for (auto Inst : OpenIntervals) {
5817           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5818           if (RegUsage.find(ClassID) == RegUsage.end())
5819             RegUsage[ClassID] = 1;
5820           else
5821             RegUsage[ClassID] += 1;
5822         }
5823       } else {
5824         collectUniformsAndScalars(VFs[j]);
5825         for (auto Inst : OpenIntervals) {
5826           // Skip ignored values for VF > 1.
5827           if (VecValuesToIgnore.count(Inst))
5828             continue;
5829           if (isScalarAfterVectorization(Inst, VFs[j])) {
5830             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5831             if (RegUsage.find(ClassID) == RegUsage.end())
5832               RegUsage[ClassID] = 1;
5833             else
5834               RegUsage[ClassID] += 1;
5835           } else {
5836             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5837             if (RegUsage.find(ClassID) == RegUsage.end())
5838               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5839             else
5840               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5841           }
5842         }
5843       }
5844 
5845       for (auto& pair : RegUsage) {
5846         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5847           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5848         else
5849           MaxUsages[j][pair.first] = pair.second;
5850       }
5851     }
5852 
5853     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5854                       << OpenIntervals.size() << '\n');
5855 
5856     // Add the current instruction to the list of open intervals.
5857     OpenIntervals.insert(I);
5858   }
5859 
5860   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5861     SmallMapVector<unsigned, unsigned, 4> Invariant;
5862 
5863     for (auto Inst : LoopInvariants) {
5864       unsigned Usage =
5865           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5866       unsigned ClassID =
5867           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5868       if (Invariant.find(ClassID) == Invariant.end())
5869         Invariant[ClassID] = Usage;
5870       else
5871         Invariant[ClassID] += Usage;
5872     }
5873 
5874     LLVM_DEBUG({
5875       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5876       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5877              << " item\n";
5878       for (const auto &pair : MaxUsages[i]) {
5879         dbgs() << "LV(REG): RegisterClass: "
5880                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5881                << " registers\n";
5882       }
5883       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5884              << " item\n";
5885       for (const auto &pair : Invariant) {
5886         dbgs() << "LV(REG): RegisterClass: "
5887                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5888                << " registers\n";
5889       }
5890     });
5891 
5892     RU.LoopInvariantRegs = Invariant;
5893     RU.MaxLocalUsers = MaxUsages[i];
5894     RUs[i] = RU;
5895   }
5896 
5897   return RUs;
5898 }
5899 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5901   // TODO: Cost model for emulated masked load/store is completely
5902   // broken. This hack guides the cost model to use an artificially
5903   // high enough value to practically disable vectorization with such
5904   // operations, except where previously deployed legality hack allowed
5905   // using very low cost values. This is to avoid regressions coming simply
5906   // from moving "masked load/store" check from legality to cost model.
5907   // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
5909   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5910   return isa<LoadInst>(I) ||
5911          (isa<StoreInst>(I) &&
5912           NumPredStores > NumberOfStoresToPredicate);
5913 }
5914 
5915 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5916   // If we aren't vectorizing the loop, or if we've already collected the
5917   // instructions to scalarize, there's nothing to do. Collection may already
5918   // have occurred if we have a user-selected VF and are now computing the
5919   // expected cost for interleaving.
5920   if (VF.isScalar() || VF.isZero() ||
5921       InstsToScalarize.find(VF) != InstsToScalarize.end())
5922     return;
5923 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5925   // not profitable to scalarize any instructions, the presence of VF in the
5926   // map will indicate that we've analyzed it already.
5927   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5928 
5929   // Find all the instructions that are scalar with predication in the loop and
5930   // determine if it would be better to not if-convert the blocks they are in.
5931   // If so, we also record the instructions to scalarize.
5932   for (BasicBlock *BB : TheLoop->blocks()) {
5933     if (!blockNeedsPredication(BB))
5934       continue;
5935     for (Instruction &I : *BB)
5936       if (isScalarWithPredication(&I)) {
5937         ScalarCostsTy ScalarCosts;
5938         // Do not apply discount logic if hacked cost is needed
5939         // for emulated masked memrefs.
5940         if (!useEmulatedMaskMemRefHack(&I) &&
5941             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5942           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5943         // Remember that BB will remain after vectorization.
5944         PredicatedBBsAfterVectorization.insert(BB);
5945       }
5946   }
5947 }
5948 
5949 int LoopVectorizationCostModel::computePredInstDiscount(
5950     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5951     ElementCount VF) {
5952   assert(!isUniformAfterVectorization(PredInst, VF) &&
5953          "Instruction marked uniform-after-vectorization will be predicated");
5954 
5955   // Initialize the discount to zero, meaning that the scalar version and the
5956   // vector version cost the same.
5957   int Discount = 0;
5958 
5959   // Holds instructions to analyze. The instructions we visit are mapped in
5960   // ScalarCosts. Those instructions are the ones that would be scalarized if
5961   // we find that the scalar version costs less.
5962   SmallVector<Instruction *, 8> Worklist;
5963 
5964   // Returns true if the given instruction can be scalarized.
5965   auto canBeScalarized = [&](Instruction *I) -> bool {
5966     // We only attempt to scalarize instructions forming a single-use chain
5967     // from the original predicated block that would otherwise be vectorized.
5968     // Although not strictly necessary, we give up on instructions we know will
5969     // already be scalar to avoid traversing chains that are unlikely to be
5970     // beneficial.
5971     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5972         isScalarAfterVectorization(I, VF))
5973       return false;
5974 
5975     // If the instruction is scalar with predication, it will be analyzed
5976     // separately. We ignore it within the context of PredInst.
5977     if (isScalarWithPredication(I))
5978       return false;
5979 
5980     // If any of the instruction's operands are uniform after vectorization,
5981     // the instruction cannot be scalarized. This prevents, for example, a
5982     // masked load from being scalarized.
5983     //
5984     // We assume we will only emit a value for lane zero of an instruction
5985     // marked uniform after vectorization, rather than VF identical values.
5986     // Thus, if we scalarize an instruction that uses a uniform, we would
5987     // create uses of values corresponding to the lanes we aren't emitting code
5988     // for. This behavior can be changed by allowing getScalarValue to clone
5989     // the lane zero values for uniforms rather than asserting.
5990     for (Use &U : I->operands())
5991       if (auto *J = dyn_cast<Instruction>(U.get()))
5992         if (isUniformAfterVectorization(J, VF))
5993           return false;
5994 
5995     // Otherwise, we can scalarize the instruction.
5996     return true;
5997   };
5998 
5999   // Compute the expected cost discount from scalarizing the entire expression
6000   // feeding the predicated instruction. We currently only consider expressions
6001   // that are single-use instruction chains.
6002   Worklist.push_back(PredInst);
6003   while (!Worklist.empty()) {
6004     Instruction *I = Worklist.pop_back_val();
6005 
6006     // If we've already analyzed the instruction, there's nothing to do.
6007     if (ScalarCosts.find(I) != ScalarCosts.end())
6008       continue;
6009 
6010     // Compute the cost of the vector instruction. Note that this cost already
6011     // includes the scalarization overhead of the predicated instruction.
6012     unsigned VectorCost = getInstructionCost(I, VF).first;
6013 
6014     // Compute the cost of the scalarized instruction. This cost is the cost of
6015     // the instruction as if it wasn't if-converted and instead remained in the
6016     // predicated block. We will scale this cost by block probability after
6017     // computing the scalarization overhead.
6018     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6019     unsigned ScalarCost =
6020         VF.getKnownMinValue() *
6021         getInstructionCost(I, ElementCount::getFixed(1)).first;
6022 
6023     // Compute the scalarization overhead of needed insertelement instructions
6024     // and phi nodes.
6025     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6026       ScalarCost += TTI.getScalarizationOverhead(
6027           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6028           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6029       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6030       ScalarCost +=
6031           VF.getKnownMinValue() *
6032           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6033     }
6034 
6035     // Compute the scalarization overhead of needed extractelement
6036     // instructions. For each of the instruction's operands, if the operand can
6037     // be scalarized, add it to the worklist; otherwise, account for the
6038     // overhead.
6039     for (Use &U : I->operands())
6040       if (auto *J = dyn_cast<Instruction>(U.get())) {
6041         assert(VectorType::isValidElementType(J->getType()) &&
6042                "Instruction has non-scalar type");
6043         if (canBeScalarized(J))
6044           Worklist.push_back(J);
6045         else if (needsExtract(J, VF)) {
6046           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6047           ScalarCost += TTI.getScalarizationOverhead(
6048               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6049               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6050         }
6051       }
6052 
6053     // Scale the total scalar cost by block probability.
6054     ScalarCost /= getReciprocalPredBlockProb();
6055 
6056     // Compute the discount. A non-negative discount means the vector version
6057     // of the instruction costs more, and scalarizing would be beneficial.
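         // For example (illustrative numbers, ignoring insert/extract
         // overhead): with VF == 4, VectorCost == 12 and a per-lane scalar
         // cost of 2, ScalarCost is 4 * 2 == 8 before scaling and 4 after
         // dividing by the block-probability factor (typically 2), so the
         // discount grows by 12 - 4 == 8 and scalarization looks profitable.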
6058     Discount += VectorCost - ScalarCost;
6059     ScalarCosts[I] = ScalarCost;
6060   }
6061 
6062   return Discount;
6063 }
6064 
6065 LoopVectorizationCostModel::VectorizationCostTy
6066 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6067   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6068   VectorizationCostTy Cost;
6069 
6070   // For each block.
6071   for (BasicBlock *BB : TheLoop->blocks()) {
6072     VectorizationCostTy BlockCost;
6073 
6074     // For each instruction in the old loop.
6075     for (Instruction &I : BB->instructionsWithoutDebug()) {
6076       // Skip ignored values.
6077       if (ValuesToIgnore.count(&I) ||
6078           (VF.isVector() && VecValuesToIgnore.count(&I)))
6079         continue;
6080 
6081       VectorizationCostTy C = getInstructionCost(&I, VF);
6082 
6083       // Check if we should override the cost.
6084       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6085         C.first = ForceTargetInstructionCost;
6086 
6087       BlockCost.first += C.first;
6088       BlockCost.second |= C.second;
6089       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6090                         << " for VF " << VF << " For instruction: " << I
6091                         << '\n');
6092     }
6093 
6094     // If we are vectorizing a predicated block, it will have been
6095     // if-converted. This means that the block's instructions (aside from
6096     // stores and instructions that may divide by zero) will now be
6097     // unconditionally executed. For the scalar case, we may not always execute
6098     // the predicated block. Thus, scale the block's cost by the probability of
6099     // executing it.
6100     if (VF.isScalar() && blockNeedsPredication(BB))
6101       BlockCost.first /= getReciprocalPredBlockProb();
6102 
6103     Cost.first += BlockCost.first;
6104     Cost.second |= BlockCost.second;
6105   }
6106 
6107   return Cost;
6108 }
6109 
6110 /// Gets Address Access SCEV after verifying that the access pattern
6111 /// is loop invariant except for the induction variable dependence.
6112 ///
6113 /// This SCEV can be sent to the Target in order to estimate the address
6114 /// calculation cost.
6115 static const SCEV *getAddressAccessSCEV(
6116               Value *Ptr,
6117               LoopVectorizationLegality *Legal,
6118               PredicatedScalarEvolution &PSE,
6119               const Loop *TheLoop) {
6120 
6121   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6122   if (!Gep)
6123     return nullptr;
6124 
6125   // We are looking for a gep with all loop invariant indices except for one
6126   // which should be an induction variable.
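       // For example (illustrative IR), for
       //   %gep = getelementptr [8 x [8 x i32]], [8 x [8 x i32]]* %A,
       //                        i64 %inv0, i64 %iv, i64 %inv1
       // we expect %inv0 and %inv1 to be loop invariant and %iv to be an
       // induction variable.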
6127   auto SE = PSE.getSE();
6128   unsigned NumOperands = Gep->getNumOperands();
6129   for (unsigned i = 1; i < NumOperands; ++i) {
6130     Value *Opd = Gep->getOperand(i);
6131     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6132         !Legal->isInductionVariable(Opd))
6133       return nullptr;
6134   }
6135 
6136   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6137   return PSE.getSCEV(Ptr);
6138 }
6139 
6140 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6141   return Legal->hasStride(I->getOperand(0)) ||
6142          Legal->hasStride(I->getOperand(1));
6143 }
6144 
6145 unsigned
6146 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6147                                                         ElementCount VF) {
6148   assert(VF.isVector() &&
6149          "Scalarization cost of instruction implies vectorization.");
6150   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6151   Type *ValTy = getMemInstValueType(I);
6152   auto SE = PSE.getSE();
6153 
6154   unsigned AS = getLoadStoreAddressSpace(I);
6155   Value *Ptr = getLoadStorePointerOperand(I);
6156   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6157 
6158   // Figure out whether the access is strided and get the stride value
6159   // if it's known at compile time.
6160   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6161 
6162   // Get the cost of the scalar memory instruction and address computation.
6163   unsigned Cost =
6164       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6165 
6166   // Don't pass *I here, since it is scalar but will actually be part of a
6167   // vectorized loop where its user is a vectorized instruction.
6168   const Align Alignment = getLoadStoreAlignment(I);
6169   Cost += VF.getKnownMinValue() *
6170           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6171                               AS, TTI::TCK_RecipThroughput);
6172 
6173   // Get the overhead of the extractelement and insertelement instructions
6174   // we might create due to scalarization.
6175   Cost += getScalarizationOverhead(I, VF);
6176 
6177   // If we have a predicated store, it may not be executed for each vector
6178   // lane. Scale the cost by the probability of executing the predicated
6179   // block.
6180   if (isPredicatedInst(I)) {
6181     Cost /= getReciprocalPredBlockProb();
6182 
6183     if (useEmulatedMaskMemRefHack(I))
6184       // Artificially set the cost to a value high enough to practically
6185       // disable vectorization with such operations.
6186       Cost = 3000000;
6187   }
6188 
6189   return Cost;
6190 }
6191 
6192 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6193                                                              ElementCount VF) {
6194   Type *ValTy = getMemInstValueType(I);
6195   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6196   Value *Ptr = getLoadStorePointerOperand(I);
6197   unsigned AS = getLoadStoreAddressSpace(I);
6198   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6199   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6200 
6201   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6202          "Stride should be 1 or -1 for consecutive memory access");
6203   const Align Alignment = getLoadStoreAlignment(I);
6204   unsigned Cost = 0;
6205   if (Legal->isMaskRequired(I))
6206     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6207                                       CostKind);
6208   else
6209     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6210                                 CostKind, I);
6211 
6212   bool Reverse = ConsecutiveStride < 0;
6213   if (Reverse)
6214     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6215   return Cost;
6216 }
6217 
6218 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6219                                                          ElementCount VF) {
6220   Type *ValTy = getMemInstValueType(I);
6221   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6222   const Align Alignment = getLoadStoreAlignment(I);
6223   unsigned AS = getLoadStoreAddressSpace(I);
6224   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6225   if (isa<LoadInst>(I)) {
6226     return TTI.getAddressComputationCost(ValTy) +
6227            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6228                                CostKind) +
6229            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6230   }
6231   StoreInst *SI = cast<StoreInst>(I);
6232 
6233   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6234   return TTI.getAddressComputationCost(ValTy) +
6235          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6236                              CostKind) +
6237          (isLoopInvariantStoreValue
6238               ? 0
6239               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6240                                        VF.getKnownMinValue() - 1));
6241 }
6242 
6243 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6244                                                           ElementCount VF) {
6245   Type *ValTy = getMemInstValueType(I);
6246   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6247   const Align Alignment = getLoadStoreAlignment(I);
6248   const Value *Ptr = getLoadStorePointerOperand(I);
6249 
6250   return TTI.getAddressComputationCost(VectorTy) +
6251          TTI.getGatherScatterOpCost(
6252              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6253              TargetTransformInfo::TCK_RecipThroughput, I);
6254 }
6255 
6256 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6257                                                             ElementCount VF) {
6258   Type *ValTy = getMemInstValueType(I);
6259   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6260   unsigned AS = getLoadStoreAddressSpace(I);
6261 
6262   auto Group = getInterleavedAccessGroup(I);
6263   assert(Group && "Failed to get an interleaved access group.");
6264 
6265   unsigned InterleaveFactor = Group->getFactor();
6266   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6267   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6268 
6269   // Holds the indices of existing members in an interleaved load group.
6270   // An interleaved store group doesn't need this as it doesn't allow gaps.
6271   SmallVector<unsigned, 4> Indices;
6272   if (isa<LoadInst>(I)) {
6273     for (unsigned i = 0; i < InterleaveFactor; i++)
6274       if (Group->getMember(i))
6275         Indices.push_back(i);
6276   }
6277 
6278   // Calculate the cost of the whole interleaved group.
6279   bool UseMaskForGaps =
6280       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6281   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6282       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6283       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6284 
6285   if (Group->isReverse()) {
6286     // TODO: Add support for reversed masked interleaved access.
6287     assert(!Legal->isMaskRequired(I) &&
6288            "Reverse masked interleaved access not supported.");
6289     Cost += Group->getNumMembers() *
6290             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6291   }
6292   return Cost;
6293 }
6294 
6295 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6296                                                               ElementCount VF) {
6297   // Calculate scalar cost only. Vectorization cost should be ready at this
6298   // moment.
6299   if (VF.isScalar()) {
6300     Type *ValTy = getMemInstValueType(I);
6301     const Align Alignment = getLoadStoreAlignment(I);
6302     unsigned AS = getLoadStoreAddressSpace(I);
6303 
6304     return TTI.getAddressComputationCost(ValTy) +
6305            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6306                                TTI::TCK_RecipThroughput, I);
6307   }
6308   return getWideningCost(I, VF);
6309 }
6310 
6311 LoopVectorizationCostModel::VectorizationCostTy
6312 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6313                                                ElementCount VF) {
6314   assert(!VF.isScalable() &&
6315          "the cost model is not yet implemented for scalable vectorization");
6316   // If we know that this instruction will remain uniform, check the cost of
6317   // the scalar version.
6318   if (isUniformAfterVectorization(I, VF))
6319     VF = ElementCount::getFixed(1);
6320 
6321   if (VF.isVector() && isProfitableToScalarize(I, VF))
6322     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6323 
6324   // Forced scalars do not have any scalarization overhead.
6325   auto ForcedScalar = ForcedScalars.find(VF);
6326   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6327     auto InstSet = ForcedScalar->second;
6328     if (InstSet.count(I))
6329       return VectorizationCostTy(
6330           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6331            VF.getKnownMinValue()),
6332           false);
6333   }
6334 
6335   Type *VectorTy;
6336   unsigned C = getInstructionCost(I, VF, VectorTy);
6337 
6338   bool TypeNotScalarized =
6339       VF.isVector() && VectorTy->isVectorTy() &&
6340       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6341   return VectorizationCostTy(C, TypeNotScalarized);
6342 }
6343 
6344 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6345                                                               ElementCount VF) {
6346 
6347   assert(!VF.isScalable() &&
6348          "cannot compute scalarization overhead for scalable vectorization");
6349   if (VF.isScalar())
6350     return 0;
6351 
6352   unsigned Cost = 0;
6353   Type *RetTy = ToVectorTy(I->getType(), VF);
6354   if (!RetTy->isVoidTy() &&
6355       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6356     Cost += TTI.getScalarizationOverhead(
6357         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6358         true, false);
6359 
6360   // Some targets keep addresses scalar.
6361   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6362     return Cost;
6363 
6364   // Some targets support efficient element stores.
6365   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6366     return Cost;
6367 
6368   // Collect operands to consider.
6369   CallInst *CI = dyn_cast<CallInst>(I);
6370   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6371 
6372   // Skip operands that do not require extraction/scalarization and do not
6373   // incur any overhead.
6374   return Cost + TTI.getOperandsScalarizationOverhead(
6375                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6376 }
6377 
6378 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6379   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6380   if (VF.isScalar())
6381     return;
6382   NumPredStores = 0;
6383   for (BasicBlock *BB : TheLoop->blocks()) {
6384     // For each instruction in the old loop.
6385     for (Instruction &I : *BB) {
6386       Value *Ptr = getLoadStorePointerOperand(&I);
6387       if (!Ptr)
6388         continue;
6389 
6390       // TODO: We should generate better code and update the cost model for
6391       // predicated uniform stores. Today they are treated as any other
6392       // predicated store (see added test cases in
6393       // invariant-store-vectorization.ll).
6394       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6395         NumPredStores++;
6396 
6397       if (Legal->isUniform(Ptr) &&
6398           // Conditional loads and stores should be scalarized and predicated.
6399           // isScalarWithPredication cannot be used here since masked
6400           // gather/scatters are not considered scalar with predication.
6401           !Legal->blockNeedsPredication(I.getParent())) {
6402         // TODO: Avoid replicating loads and stores instead of
6403         // relying on instcombine to remove them.
6404         // Load: Scalar load + broadcast
6405         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6406         unsigned Cost = getUniformMemOpCost(&I, VF);
6407         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6408         continue;
6409       }
6410 
6411       // We assume that widening is the best solution when possible.
6412       if (memoryInstructionCanBeWidened(&I, VF)) {
6413         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6414         int ConsecutiveStride =
6415                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6416         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6417                "Expected consecutive stride.");
6418         InstWidening Decision =
6419             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6420         setWideningDecision(&I, VF, Decision, Cost);
6421         continue;
6422       }
6423 
6424       // Choose between Interleaving, Gather/Scatter or Scalarization.
6425       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6426       unsigned NumAccesses = 1;
6427       if (isAccessInterleaved(&I)) {
6428         auto Group = getInterleavedAccessGroup(&I);
6429         assert(Group && "Failed to get an interleaved access group.");
6430 
6431         // Make one decision for the whole group.
6432         if (getWideningDecision(&I, VF) != CM_Unknown)
6433           continue;
6434 
6435         NumAccesses = Group->getNumMembers();
6436         if (interleavedAccessCanBeWidened(&I, VF))
6437           InterleaveCost = getInterleaveGroupCost(&I, VF);
6438       }
6439 
6440       unsigned GatherScatterCost =
6441           isLegalGatherOrScatter(&I)
6442               ? getGatherScatterCost(&I, VF) * NumAccesses
6443               : std::numeric_limits<unsigned>::max();
6444 
6445       unsigned ScalarizationCost =
6446           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6447 
6448       // Choose the better solution for the current VF,
6449       // record this decision and use it during vectorization.
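           // For example (illustrative numbers), if InterleaveCost == 6,
           // GatherScatterCost == 12 and ScalarizationCost == 8, we record
           // CM_Interleave with a cost of 6.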
6450       unsigned Cost;
6451       InstWidening Decision;
6452       if (InterleaveCost <= GatherScatterCost &&
6453           InterleaveCost < ScalarizationCost) {
6454         Decision = CM_Interleave;
6455         Cost = InterleaveCost;
6456       } else if (GatherScatterCost < ScalarizationCost) {
6457         Decision = CM_GatherScatter;
6458         Cost = GatherScatterCost;
6459       } else {
6460         Decision = CM_Scalarize;
6461         Cost = ScalarizationCost;
6462       }
6463       // If the instruction belongs to an interleave group, the whole group
6464       // receives the same decision. The whole group receives the cost, but
6465       // the cost will actually be assigned to one instruction.
6466       if (auto Group = getInterleavedAccessGroup(&I))
6467         setWideningDecision(Group, VF, Decision, Cost);
6468       else
6469         setWideningDecision(&I, VF, Decision, Cost);
6470     }
6471   }
6472 
6473   // Make sure that any load of an address and any other address computation
6474   // remain scalar unless there is gather/scatter support. This avoids
6475   // inevitable extracts into address registers, and also has the benefit of
6476   // activating LSR more, since that pass can't optimize vectorized
6477   // addresses.
6478   if (TTI.prefersVectorizedAddressing())
6479     return;
6480 
6481   // Start with all scalar pointer uses.
6482   SmallPtrSet<Instruction *, 8> AddrDefs;
6483   for (BasicBlock *BB : TheLoop->blocks())
6484     for (Instruction &I : *BB) {
6485       Instruction *PtrDef =
6486         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6487       if (PtrDef && TheLoop->contains(PtrDef) &&
6488           getWideningDecision(&I, VF) != CM_GatherScatter)
6489         AddrDefs.insert(PtrDef);
6490     }
6491 
6492   // Add all instructions used to generate the addresses.
6493   SmallVector<Instruction *, 4> Worklist;
6494   for (auto *I : AddrDefs)
6495     Worklist.push_back(I);
6496   while (!Worklist.empty()) {
6497     Instruction *I = Worklist.pop_back_val();
6498     for (auto &Op : I->operands())
6499       if (auto *InstOp = dyn_cast<Instruction>(Op))
6500         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6501             AddrDefs.insert(InstOp).second)
6502           Worklist.push_back(InstOp);
6503   }
6504 
6505   for (auto *I : AddrDefs) {
6506     if (isa<LoadInst>(I)) {
6507       // Setting the desired widening decision should ideally be handled by
6508       // cost functions, but since this involves finding out whether the
6509       // loaded register is involved in an address computation, it is instead
6510       // changed here when we know this is the case.
6511       InstWidening Decision = getWideningDecision(I, VF);
6512       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6513         // Scalarize a widened load of address.
6514         setWideningDecision(
6515             I, VF, CM_Scalarize,
6516             (VF.getKnownMinValue() *
6517              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6518       else if (auto Group = getInterleavedAccessGroup(I)) {
6519         // Scalarize an interleave group of address loads.
6520         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6521           if (Instruction *Member = Group->getMember(I))
6522             setWideningDecision(
6523                 Member, VF, CM_Scalarize,
6524                 (VF.getKnownMinValue() *
6525                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6526         }
6527       }
6528     } else
6529       // Make sure I gets scalarized and receives a cost estimate without
6530       // scalarization overhead.
6531       ForcedScalars[VF].insert(I);
6532   }
6533 }
6534 
6535 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6536                                                         ElementCount VF,
6537                                                         Type *&VectorTy) {
6538   Type *RetTy = I->getType();
6539   if (canTruncateToMinimalBitwidth(I, VF))
6540     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6541   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6542   auto SE = PSE.getSE();
6543   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6544 
6545   // TODO: We need to estimate the cost of intrinsic calls.
6546   switch (I->getOpcode()) {
6547   case Instruction::GetElementPtr:
6548     // We mark this instruction as zero-cost because the cost of GEPs in
6549     // vectorized code depends on whether the corresponding memory instruction
6550     // is scalarized or not. Therefore, we handle GEPs with the memory
6551     // instruction cost.
6552     return 0;
6553   case Instruction::Br: {
6554     // In cases of scalarized and predicated instructions, there will be VF
6555     // predicated blocks in the vectorized loop. Each branch around these
6556     // blocks also requires an extract of its vector compare i1 element.
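         // For example, with VF == 4 the predicated block is replicated four
         // times; each replica extracts one i1 lane of the vector compare and
         // branches on it.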
6557     bool ScalarPredicatedBB = false;
6558     BranchInst *BI = cast<BranchInst>(I);
6559     if (VF.isVector() && BI->isConditional() &&
6560         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6561          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6562       ScalarPredicatedBB = true;
6563 
6564     if (ScalarPredicatedBB) {
6565       // Return cost for branches around scalarized and predicated blocks.
6566       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6567       auto *Vec_i1Ty =
6568           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6569       return (TTI.getScalarizationOverhead(
6570                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6571                   false, true) +
6572               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6573                VF.getKnownMinValue()));
6574     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6575       // The back-edge branch will remain, as will all scalar branches.
6576       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6577     else
6578       // This branch will be eliminated by if-conversion.
6579       return 0;
6580     // Note: We currently assume zero cost for an unconditional branch inside
6581     // a predicated block since it will become a fall-through, although we
6582     // may decide in the future to call TTI for all branches.
6583   }
6584   case Instruction::PHI: {
6585     auto *Phi = cast<PHINode>(I);
6586 
6587     // First-order recurrences are replaced by vector shuffles inside the loop.
6588     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6589     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6590       return TTI.getShuffleCost(
6591           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6592           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6593 
6594     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6595     // converted into select instructions. We require N - 1 selects per phi
6596     // node, where N is the number of incoming values.
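         // For example, a phi node with three incoming values is lowered to
         // two nested select instructions.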
6597     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6598       return (Phi->getNumIncomingValues() - 1) *
6599              TTI.getCmpSelInstrCost(
6600                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6601                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6602                  CostKind);
6603 
6604     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6605   }
6606   case Instruction::UDiv:
6607   case Instruction::SDiv:
6608   case Instruction::URem:
6609   case Instruction::SRem:
6610     // If we have a predicated instruction, it may not be executed for each
6611     // vector lane. Get the scalarization cost and scale this amount by the
6612     // probability of executing the predicated block. If the instruction is not
6613     // predicated, we fall through to the next case.
6614     if (VF.isVector() && isScalarWithPredication(I)) {
6615       unsigned Cost = 0;
6616 
6617       // These instructions have a non-void type, so account for the phi nodes
6618       // that we will create. This cost is likely to be zero. The phi node
6619       // cost, if any, should be scaled by the block probability because it
6620       // models a copy at the end of each predicated block.
6621       Cost += VF.getKnownMinValue() *
6622               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6623 
6624       // The cost of the non-predicated instruction.
6625       Cost += VF.getKnownMinValue() *
6626               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6627 
6628       // The cost of insertelement and extractelement instructions needed for
6629       // scalarization.
6630       Cost += getScalarizationOverhead(I, VF);
6631 
6632       // Scale the cost by the probability of executing the predicated blocks.
6633       // This assumes the predicated block for each vector lane is equally
6634       // likely.
6635       return Cost / getReciprocalPredBlockProb();
6636     }
6637     LLVM_FALLTHROUGH;
6638   case Instruction::Add:
6639   case Instruction::FAdd:
6640   case Instruction::Sub:
6641   case Instruction::FSub:
6642   case Instruction::Mul:
6643   case Instruction::FMul:
6644   case Instruction::FDiv:
6645   case Instruction::FRem:
6646   case Instruction::Shl:
6647   case Instruction::LShr:
6648   case Instruction::AShr:
6649   case Instruction::And:
6650   case Instruction::Or:
6651   case Instruction::Xor: {
6652     // Since we will replace the stride by 1, the multiplication should go away.
6653     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6654       return 0;
6655     // Certain instructions can be cheaper to vectorize if they have a constant
6656     // second vector operand. One example of this is shifts on x86.
6657     Value *Op2 = I->getOperand(1);
6658     TargetTransformInfo::OperandValueProperties Op2VP;
6659     TargetTransformInfo::OperandValueKind Op2VK =
6660         TTI.getOperandInfo(Op2, Op2VP);
6661     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6662       Op2VK = TargetTransformInfo::OK_UniformValue;
6663 
6664     SmallVector<const Value *, 4> Operands(I->operand_values());
6665     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6666     return N * TTI.getArithmeticInstrCost(
6667                    I->getOpcode(), VectorTy, CostKind,
6668                    TargetTransformInfo::OK_AnyValue,
6669                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6670   }
6671   case Instruction::FNeg: {
6672     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6673     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6674     return N * TTI.getArithmeticInstrCost(
6675                    I->getOpcode(), VectorTy, CostKind,
6676                    TargetTransformInfo::OK_AnyValue,
6677                    TargetTransformInfo::OK_AnyValue,
6678                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6679                    I->getOperand(0), I);
6680   }
6681   case Instruction::Select: {
6682     SelectInst *SI = cast<SelectInst>(I);
6683     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6684     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6685     Type *CondTy = SI->getCondition()->getType();
6686     if (!ScalarCond) {
6687       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6688       CondTy = VectorType::get(CondTy, VF);
6689     }
6690     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6691                                   CostKind, I);
6692   }
6693   case Instruction::ICmp:
6694   case Instruction::FCmp: {
6695     Type *ValTy = I->getOperand(0)->getType();
6696     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6697     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6698       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6699     VectorTy = ToVectorTy(ValTy, VF);
6700     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6701                                   I);
6702   }
6703   case Instruction::Store:
6704   case Instruction::Load: {
6705     ElementCount Width = VF;
6706     if (Width.isVector()) {
6707       InstWidening Decision = getWideningDecision(I, Width);
6708       assert(Decision != CM_Unknown &&
6709              "CM decision should be taken at this point");
6710       if (Decision == CM_Scalarize)
6711         Width = ElementCount::getFixed(1);
6712     }
6713     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6714     return getMemoryInstructionCost(I, VF);
6715   }
6716   case Instruction::ZExt:
6717   case Instruction::SExt:
6718   case Instruction::FPToUI:
6719   case Instruction::FPToSI:
6720   case Instruction::FPExt:
6721   case Instruction::PtrToInt:
6722   case Instruction::IntToPtr:
6723   case Instruction::SIToFP:
6724   case Instruction::UIToFP:
6725   case Instruction::Trunc:
6726   case Instruction::FPTrunc:
6727   case Instruction::BitCast: {
6728     // Computes the CastContextHint from a Load/Store instruction.
6729     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6730       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6731              "Expected a load or a store!");
6732 
6733       if (VF.isScalar() || !TheLoop->contains(I))
6734         return TTI::CastContextHint::Normal;
6735 
6736       switch (getWideningDecision(I, VF)) {
6737       case LoopVectorizationCostModel::CM_GatherScatter:
6738         return TTI::CastContextHint::GatherScatter;
6739       case LoopVectorizationCostModel::CM_Interleave:
6740         return TTI::CastContextHint::Interleave;
6741       case LoopVectorizationCostModel::CM_Scalarize:
6742       case LoopVectorizationCostModel::CM_Widen:
6743         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6744                                         : TTI::CastContextHint::Normal;
6745       case LoopVectorizationCostModel::CM_Widen_Reverse:
6746         return TTI::CastContextHint::Reversed;
6747       case LoopVectorizationCostModel::CM_Unknown:
6748         llvm_unreachable("Instr did not go through cost modelling?");
6749       }
6750 
6751       llvm_unreachable("Unhandled case!");
6752     };
6753 
6754     unsigned Opcode = I->getOpcode();
6755     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6756     // For Trunc, the context is the only user, which must be a StoreInst.
6757     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6758       if (I->hasOneUse())
6759         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6760           CCH = ComputeCCH(Store);
6761     }
6762     // For Z/Sext, the context is the operand, which must be a LoadInst.
6763     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6764              Opcode == Instruction::FPExt) {
6765       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6766         CCH = ComputeCCH(Load);
6767     }
6768 
6769     // We optimize the truncation of induction variables having constant
6770     // integer steps. The cost of these truncations is the same as the scalar
6771     // operation.
6772     if (isOptimizableIVTruncate(I, VF)) {
6773       auto *Trunc = cast<TruncInst>(I);
6774       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6775                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6776     }
6777 
6778     Type *SrcScalarTy = I->getOperand(0)->getType();
6779     Type *SrcVecTy =
6780         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6781     if (canTruncateToMinimalBitwidth(I, VF)) {
6782       // This cast is going to be shrunk. This may remove the cast or it might
6783       // turn it into a slightly different cast. For example, if MinBW == 16,
6784       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6785       //
6786       // Calculate the modified src and dest types.
6787       Type *MinVecTy = VectorTy;
6788       if (Opcode == Instruction::Trunc) {
6789         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6790         VectorTy =
6791             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6792       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6793         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6794         VectorTy =
6795             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6796       }
6797     }
6798 
6799     assert(!VF.isScalable() && "VF is assumed to be non scalable");
6800     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6801     return N *
6802            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6803   }
6804   case Instruction::Call: {
6805     bool NeedToScalarize;
6806     CallInst *CI = cast<CallInst>(I);
6807     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6808     if (getVectorIntrinsicIDForCall(CI, TLI))
6809       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6810     return CallCost;
6811   }
6812   default:
6813     // The cost of executing VF copies of the scalar instruction. This opcode
6814     // is unknown. Assume that it is the same as 'mul'.
6815     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6816                                        Instruction::Mul, VectorTy, CostKind) +
6817            getScalarizationOverhead(I, VF);
6818   } // end of switch.
6819 }
6820 
6821 char LoopVectorize::ID = 0;
6822 
6823 static const char lv_name[] = "Loop Vectorization";
6824 
6825 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6826 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6827 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6828 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6829 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6830 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6831 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6832 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6833 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6834 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6835 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6836 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6837 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6838 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6839 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6840 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6841 
6842 namespace llvm {
6843 
6844 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6845 
6846 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6847                               bool VectorizeOnlyWhenForced) {
6848   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6849 }
6850 
6851 } // end namespace llvm
6852 
6853 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6854   // Check if the pointer operand of a load or store instruction is
6855   // consecutive.
6856   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6857     return Legal->isConsecutivePtr(Ptr);
6858   return false;
6859 }
6860 
6861 void LoopVectorizationCostModel::collectValuesToIgnore() {
6862   // Ignore ephemeral values.
6863   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6864 
6865   // Ignore type-promoting instructions we identified during reduction
6866   // detection.
6867   for (auto &Reduction : Legal->getReductionVars()) {
6868     RecurrenceDescriptor &RedDes = Reduction.second;
6869     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6870     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6871   }
6872   // Ignore type-casting instructions we identified during induction
6873   // detection.
6874   for (auto &Induction : Legal->getInductionVars()) {
6875     InductionDescriptor &IndDes = Induction.second;
6876     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6877     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6878   }
6879 }
6880 
6881 void LoopVectorizationCostModel::collectInLoopReductions() {
6882   for (auto &Reduction : Legal->getReductionVars()) {
6883     PHINode *Phi = Reduction.first;
6884     RecurrenceDescriptor &RdxDesc = Reduction.second;
6885 
6886     // We don't collect reductions that are type promoted (yet).
6887     if (RdxDesc.getRecurrenceType() != Phi->getType())
6888       continue;
6889 
6890     // If the target would prefer this reduction to happen "in-loop", then we
6891     // want to record it as such.
6892     unsigned Opcode = RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind());
6893     if (!PreferInLoopReductions &&
6894         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6895                                    TargetTransformInfo::ReductionFlags()))
6896       continue;
6897 
6898     // Check that we can correctly put the reductions into the loop, by
6899     // finding the chain of operations that leads from the phi to the loop
6900     // exit value.
6901     SmallVector<Instruction *, 4> ReductionOperations =
6902         RdxDesc.getReductionOpChain(Phi, TheLoop);
6903     bool InLoop = !ReductionOperations.empty();
6904     if (InLoop)
6905       InLoopReductionChains[Phi] = ReductionOperations;
6906     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6907                       << " reduction for phi: " << *Phi << "\n");
6908   }
6909 }
6910 
6911 // TODO: we could return a pair of values that specify the max VF and
6912 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6913 // `buildVPlans(VF, VF)`. We cannot do this yet because VPlan at the moment
6914 // doesn't have a cost model that can choose which plan to execute if
6915 // more than one is generated.
6916 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6917                                  LoopVectorizationCostModel &CM) {
6918   unsigned WidestType;
6919   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
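       // For example, 256-bit wide vector registers and a widest loop type of
       // 32 bits yield a VF of 256 / 32 == 8.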
6920   return WidestVectorRegBits / WidestType;
6921 }
6922 
6923 VectorizationFactor
6924 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6925   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6926   ElementCount VF = UserVF;
6927   // Outer loop handling: They may require CFG and instruction level
6928   // transformations before even evaluating whether vectorization is profitable.
6929   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6930   // the vectorization pipeline.
6931   if (!OrigLoop->isInnermost()) {
6932     // If the user doesn't provide a vectorization factor, determine a
6933     // reasonable one.
6934     if (UserVF.isZero()) {
6935       VF = ElementCount::getFixed(
6936           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6937       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6938 
6939       // Make sure we have a VF > 1 for stress testing.
6940       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6941         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6942                           << "overriding computed VF.\n");
6943         VF = ElementCount::getFixed(4);
6944       }
6945     }
6946     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6947     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6948            "VF needs to be a power of two");
6949     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6950                       << "VF " << VF << " to build VPlans.\n");
6951     buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue());
6952 
6953     // For VPlan build stress testing, we bail out after VPlan construction.
6954     if (VPlanBuildStressTest)
6955       return VectorizationFactor::Disabled();
6956 
6957     return {VF, 0 /*Cost*/};
6958   }
6959 
6960   LLVM_DEBUG(
6961       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6962                 "VPlan-native path.\n");
6963   return VectorizationFactor::Disabled();
6964 }
6965 
6966 Optional<VectorizationFactor>
6967 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6968   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
6969   assert(OrigLoop->isInnermost() && "Inner loop expected.");
6970   Optional<unsigned> MaybeMaxVF =
6971       CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC);
6972   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6973     return None;
6974 
6975   // Invalidate interleave groups if all blocks of loop will be predicated.
6976   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6977       !useMaskedInterleavedAccesses(*TTI)) {
6978     LLVM_DEBUG(
6979         dbgs()
6980         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6981            "which requires masked-interleaved support.\n");
6982     if (CM.InterleaveInfo.invalidateGroups())
6983       // Invalidating interleave groups also requires invalidating all decisions
6984       // based on them, which includes widening decisions and uniform and scalar
6985       // values.
6986       CM.invalidateCostModelingDecisions();
6987   }
6988 
6989   if (!UserVF.isZero()) {
6990     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6991     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6992            "VF needs to be a power of two");
6993     // Collect the instructions (and their associated costs) that will be more
6994     // profitable to scalarize.
6995     CM.selectUserVectorizationFactor(UserVF);
6996     CM.collectInLoopReductions();
6997     buildVPlansWithVPRecipes(UserVF.getKnownMinValue(),
6998                              UserVF.getKnownMinValue());
6999     LLVM_DEBUG(printPlans(dbgs()));
7000     return {{UserVF, 0}};
7001   }
7002 
7003   unsigned MaxVF = MaybeMaxVF.getValue();
7004   assert(MaxVF != 0 && "MaxVF is zero.");
7005 
7006   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
7007     // Collect Uniform and Scalar instructions after vectorization with VF.
7008     CM.collectUniformsAndScalars(ElementCount::getFixed(VF));
7009 
7010     // Collect the instructions (and their associated costs) that will be more
7011     // profitable to scalarize.
7012     if (VF > 1)
7013       CM.collectInstsToScalarize(ElementCount::getFixed(VF));
7014   }
7015 
7016   CM.collectInLoopReductions();
7017 
7018   buildVPlansWithVPRecipes(1, MaxVF);
7019   LLVM_DEBUG(printPlans(dbgs()));
7020   if (MaxVF == 1)
7021     return VectorizationFactor::Disabled();
7022 
7023   // Select the optimal vectorization factor.
7024   return CM.selectVectorizationFactor(MaxVF);
7025 }
7026 
7027 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7028   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7029                     << '\n');
7030   BestVF = VF;
7031   BestUF = UF;
7032 
7033   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7034     return !Plan->hasVF(VF);
7035   });
7036   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7037 }
7038 
7039 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7040                                            DominatorTree *DT) {
7041   // Perform the actual loop transformation.
7042 
7043   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7044   VPCallbackILV CallbackILV(ILV);
7045 
7046   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7047 
7048   VPTransformState State{*BestVF, BestUF,      LI,
7049                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7050                          &ILV,    CallbackILV};
7051   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7052   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7053   State.CanonicalIV = ILV.Induction;
7054 
7055   //===------------------------------------------------===//
7056   //
7057   // Notice: any optimization or new instruction that goes
7058   // into the code below should also be implemented in
7059   // the cost model.
7060   //
7061   //===------------------------------------------------===//
7062 
7063   // 2. Copy and widen instructions from the old loop into the new loop.
7064   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7065   VPlans.front()->execute(&State);
7066 
7067   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7068   //    predication, updating analyses.
7069   ILV.fixVectorizedLoop();
7070 }
7071 
7072 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7073     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7074   BasicBlock *Latch = OrigLoop->getLoopLatch();
7075 
7076   // We create new control-flow for the vectorized loop, so the original
7077   // condition will be dead after vectorization if it's only used by the
7078   // branch.
7079   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7080   if (Cmp && Cmp->hasOneUse()) {
7081     DeadInstructions.insert(Cmp);
7082 
7083     // The operands of the icmp are often dead truncs, used by IndUpdate.
7084     for (Value *Op : Cmp->operands()) {
7085       if (isa<TruncInst>(Op) && Op->hasOneUse())
7086         DeadInstructions.insert(cast<Instruction>(Op));
7087     }
7088   }
7089 
7090   // We create new "steps" for induction variable updates to which the original
7091   // induction variables map. An original update instruction will be dead if
7092   // all its users except the induction variable are dead.
7093   for (auto &Induction : Legal->getInductionVars()) {
7094     PHINode *Ind = Induction.first;
7095     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7096     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7097           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7098         }))
7099       DeadInstructions.insert(IndUpdate);
7100 
7101     // We also record as "Dead" the type-casting instructions we had identified
7102     // during induction analysis. We don't need any handling for them in the
7103     // vectorized loop because we have proven that, under a proper runtime
7104     // test guarding the vectorized loop, the value of the phi and the casted
7105     // value of the phi are the same. The last instruction in this casting chain
7106     // will get its scalar/vector/widened def from the scalar/vector/widened def
7107     // of the respective phi node. Any other casts in the induction def-use chain
7108     // have no other uses outside the phi update chain, and will be ignored.
7109     InductionDescriptor &IndDes = Induction.second;
7110     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7111     DeadInstructions.insert(Casts.begin(), Casts.end());
7112   }
7113 }
7114 
7115 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7116 
7117 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7118 
7119 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7120                                         Instruction::BinaryOps BinOp) {
7121   // When unrolling and the VF is 1, we only need to add a simple scalar.
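       // For example, for the unrolled copy with StartIdx == 2 and Step == 1,
       // the result is simply Val + 2 (or the equivalent fast-math FP
       // operation below).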
7122   Type *Ty = Val->getType();
7123   assert(!Ty->isVectorTy() && "Val must be a scalar");
7124 
7125   if (Ty->isFloatingPointTy()) {
7126     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7127 
7128     // Floating point operations had to be 'fast' to enable the unrolling.
7129     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7130     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7131   }
7132   Constant *C = ConstantInt::get(Ty, StartIdx);
7133   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7134 }
7135 
7136 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7137   SmallVector<Metadata *, 4> MDs;
7138   // Reserve first location for self reference to the LoopID metadata node.
7139   MDs.push_back(nullptr);
7140   bool IsUnrollMetadata = false;
7141   MDNode *LoopID = L->getLoopID();
7142   if (LoopID) {
7143     // First find existing loop unrolling disable metadata.
7144     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7145       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7146       if (MD) {
7147         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7148         IsUnrollMetadata =
7149             S && S->getString().startswith("llvm.loop.unroll.disable");
7150       }
7151       MDs.push_back(LoopID->getOperand(i));
7152     }
7153   }
7154 
7155   if (!IsUnrollMetadata) {
7156     // Add runtime unroll disable metadata.
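         // For example, assuming no pre-existing operands, the resulting loop
         // metadata looks like:
         //   !0 = distinct !{!0, !1}
         //   !1 = !{!"llvm.loop.unroll.runtime.disable"}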
7157     LLVMContext &Context = L->getHeader()->getContext();
7158     SmallVector<Metadata *, 1> DisableOperands;
7159     DisableOperands.push_back(
7160         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7161     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7162     MDs.push_back(DisableNode);
7163     MDNode *NewLoopID = MDNode::get(Context, MDs);
7164     // Set operand 0 to refer to the loop id itself.
7165     NewLoopID->replaceOperandWith(0, NewLoopID);
7166     L->setLoopID(NewLoopID);
7167   }
7168 }
7169 
7170 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7171     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7172   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
7173   bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));
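       // For example, with Range = {2, 16} and a predicate that holds for VF 2
       // and VF 4 but not VF 8, Range.End is clamped to 8 and we return true.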
7174 
7175   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7176     if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
7177       Range.End = TmpVF;
7178       break;
7179     }
7180 
7181   return PredicateAtRangeStart;
7182 }
7183 
7184 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7185 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7186 /// of VF's starting at a given VF and extending it as much as possible. Each
7187 /// vectorization decision can potentially shorten this sub-range during
7188 /// buildVPlan().
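     /// For example, with \p MinVF = 1 and \p MaxVF = 8, and assuming no
     /// decision shortens a sub-range, a single VPlan covering VFs {1, 2, 4, 8}
     /// is built.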
7189 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7190   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7191     VFRange SubRange = {VF, MaxVF + 1};
7192     VPlans.push_back(buildVPlan(SubRange));
7193     VF = SubRange.End;
7194   }
7195 }
7196 
7197 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7198                                          VPlanPtr &Plan) {
7199   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7200 
7201   // Look for cached value.
7202   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7203   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7204   if (ECEntryIt != EdgeMaskCache.end())
7205     return ECEntryIt->second;
7206 
7207   VPValue *SrcMask = createBlockInMask(Src, Plan);
7208 
7209   // The terminator has to be a branch inst!
7210   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7211   assert(BI && "Unexpected terminator found");
7212 
7213   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7214     return EdgeMaskCache[Edge] = SrcMask;
7215 
7216   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7217   assert(EdgeMask && "No Edge Mask found for condition");
7218 
7219   if (BI->getSuccessor(0) != Dst)
7220     EdgeMask = Builder.createNot(EdgeMask);
7221 
7222   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7223     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7224 
7225   return EdgeMaskCache[Edge] = EdgeMask;
7226 }
7227 
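// Create (or look up in the cache) the block-in mask for \p BB. For the loop
// header the mask is either all-one (no predication needed) or an early-exit
// compare of the (primary or widened canonical) induction variable against the
// backedge-taken count, using an active-lane-mask instead when folding the
// tail and the target requests it. For any other block the mask is the OR of
// its incoming edge masks. A nullptr result denotes an all-one mask.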
7228 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7229   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7230 
7231   // Look for cached value.
7232   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7233   if (BCEntryIt != BlockMaskCache.end())
7234     return BCEntryIt->second;
7235 
7236   // All-one mask is modelled as no-mask following the convention for masked
7237   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7238   VPValue *BlockMask = nullptr;
7239 
7240   if (OrigLoop->getHeader() == BB) {
7241     if (!CM.blockNeedsPredication(BB))
7242       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7243 
7244     // Create the block in mask as the first non-phi instruction in the block.
7245     VPBuilder::InsertPointGuard Guard(Builder);
7246     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7247     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7248 
7249     // Introduce the early-exit compare IV <= BTC to form header block mask.
7250     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7251     // Start by constructing the desired canonical IV.
7252     VPValue *IV = nullptr;
7253     if (Legal->getPrimaryInduction())
7254       IV = Plan->getVPValue(Legal->getPrimaryInduction());
7255     else {
7256       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7257       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7258       IV = IVRecipe->getVPValue();
7259     }
7260     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7261     bool TailFolded = !CM.isScalarEpilogueAllowed();
7262 
7263     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // Although ActiveLaneMask is a binary op that consumes the loop
      // tripcount as its second argument, we only pass the IV here; the
      // tripcount is extracted from the transform state, where codegen of the
      // VP instructions happens.
7268       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7269     } else {
7270       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7271     }
7272     return BlockMaskCache[BB] = BlockMask;
7273   }
7274 
  // For all other blocks, the block mask is the OR of the masks of all
  // incoming edges.
7276   for (auto *Predecessor : predecessors(BB)) {
7277     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7278     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7279       return BlockMaskCache[BB] = EdgeMask;
7280 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
7282       BlockMask = EdgeMask;
7283       continue;
7284     }
7285 
7286     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7287   }
7288 
7289   return BlockMaskCache[BB] = BlockMask;
7290 }
7291 
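// Check via the cost model whether the load or store \p I should be widened
// for at least part of \p Range, clamping the range accordingly; if so, build
// a VPWidenMemoryInstructionRecipe holding the address, the block-in mask when
// masking is required and, for stores, the stored value.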
7292 VPWidenMemoryInstructionRecipe *
7293 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7294                                   VPlanPtr &Plan) {
7295   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7296          "Must be called with either a load or store");
7297 
7298   auto willWiden = [&](ElementCount VF) -> bool {
7299     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7300     if (VF.isScalar())
7301       return false;
7302     LoopVectorizationCostModel::InstWidening Decision =
7303         CM.getWideningDecision(I, VF);
7304     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7305            "CM decision should be taken at this point.");
7306     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7307       return true;
7308     if (CM.isScalarAfterVectorization(I, VF) ||
7309         CM.isProfitableToScalarize(I, VF))
7310       return false;
7311     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7312   };
7313 
7314   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7315     return nullptr;
7316 
7317   VPValue *Mask = nullptr;
7318   if (Legal->isMaskRequired(I))
7319     Mask = createBlockInMask(I->getParent(), Plan);
7320 
7321   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7322   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7323     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7324 
7325   StoreInst *Store = cast<StoreInst>(I);
7326   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7327   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7328 }
7329 
7330 VPWidenIntOrFpInductionRecipe *
7331 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7332   // Check if this is an integer or fp induction. If so, build the recipe that
7333   // produces its scalar and vector values.
7334   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7335   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7336       II.getKind() == InductionDescriptor::IK_FpInduction)
7337     return new VPWidenIntOrFpInductionRecipe(Phi);
7338 
7339   return nullptr;
7340 }
7341 
7342 VPWidenIntOrFpInductionRecipe *
7343 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7344                                                 VFRange &Range) const {
7345   // Optimize the special case where the source is a constant integer
7346   // induction variable. Notice that we can only optimize the 'trunc' case
7347   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7348   // (c) other casts depend on pointer size.
7349 
7350   // Determine whether \p K is a truncation based on an induction variable that
7351   // can be optimized.
7352   auto isOptimizableIVTruncate =
7353       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7354     return [=](ElementCount VF) -> bool {
7355       return CM.isOptimizableIVTruncate(K, VF);
7356     };
7357   };
7358 
7359   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7360           isOptimizableIVTruncate(I), Range))
7361     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7362                                              I);
7363   return nullptr;
7364 }
7365 
7366 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7367   // We know that all PHIs in non-header blocks are converted into selects, so
7368   // we don't have to worry about the insertion order and we can just use the
7369   // builder. At this point we generate the predication tree. There may be
7370   // duplications since this is a simple recursive scan, but future
7371   // optimizations will clean it up.
7372 
7373   SmallVector<VPValue *, 2> Operands;
7374   unsigned NumIncoming = Phi->getNumIncomingValues();
7375   for (unsigned In = 0; In < NumIncoming; In++) {
7376     VPValue *EdgeMask =
7377       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7378     assert((EdgeMask || NumIncoming == 1) &&
7379            "Multiple predecessors with one having a full mask");
7380     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7381     if (EdgeMask)
7382       Operands.push_back(EdgeMask);
7383   }
7384   return new VPBlendRecipe(Phi, Operands);
7385 }
7386 
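// Try to widen call \p CI. Bail out if the call must be scalarized with
// predication, if it is an assume, lifetime-marker or sideeffect intrinsic,
// or if the cost model finds neither a vector intrinsic nor a vector library
// call preferable to scalarization for the VFs remaining in \p Range.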
7387 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7388                                                    VPlan &Plan) const {
7389 
7390   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7391       [this, CI](ElementCount VF) {
7392         return CM.isScalarWithPredication(CI, VF);
7393       },
7394       Range);
7395 
7396   if (IsPredicated)
7397     return nullptr;
7398 
7399   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7400   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7401              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7402     return nullptr;
7403 
7404   auto willWiden = [&](ElementCount VF) -> bool {
7405     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we widen the call using a vector intrinsic or an
    // ordinary vector library call, i.e. whether the intrinsic call is at
    // least as cheap as the (possibly scalarized) library call.
7410     bool NeedToScalarize = false;
7411     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7412     bool UseVectorIntrinsic =
7413         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7414     return UseVectorIntrinsic || !NeedToScalarize;
7415   };
7416 
7417   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7418     return nullptr;
7419 
7420   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7421 }
7422 
7423 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7424   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7425          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it must be scalarized with
  // predication.
7428   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7429     return CM.isScalarAfterVectorization(I, VF) ||
7430            CM.isProfitableToScalarize(I, VF) ||
7431            CM.isScalarWithPredication(I, VF);
7432   };
7433   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7434                                                              Range);
7435 }
7436 
7437 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7438   auto IsVectorizableOpcode = [](unsigned Opcode) {
7439     switch (Opcode) {
7440     case Instruction::Add:
7441     case Instruction::And:
7442     case Instruction::AShr:
7443     case Instruction::BitCast:
7444     case Instruction::FAdd:
7445     case Instruction::FCmp:
7446     case Instruction::FDiv:
7447     case Instruction::FMul:
7448     case Instruction::FNeg:
7449     case Instruction::FPExt:
7450     case Instruction::FPToSI:
7451     case Instruction::FPToUI:
7452     case Instruction::FPTrunc:
7453     case Instruction::FRem:
7454     case Instruction::FSub:
7455     case Instruction::ICmp:
7456     case Instruction::IntToPtr:
7457     case Instruction::LShr:
7458     case Instruction::Mul:
7459     case Instruction::Or:
7460     case Instruction::PtrToInt:
7461     case Instruction::SDiv:
7462     case Instruction::Select:
7463     case Instruction::SExt:
7464     case Instruction::Shl:
7465     case Instruction::SIToFP:
7466     case Instruction::SRem:
7467     case Instruction::Sub:
7468     case Instruction::Trunc:
7469     case Instruction::UDiv:
7470     case Instruction::UIToFP:
7471     case Instruction::URem:
7472     case Instruction::Xor:
7473     case Instruction::ZExt:
7474       return true;
7475     }
7476     return false;
7477   };
7478 
7479   if (!IsVectorizableOpcode(I->getOpcode()))
7480     return nullptr;
7481 
7482   // Success: widen this instruction.
7483   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7484 }
7485 
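// Build a VPReplicateRecipe for \p I, producing a scalar copy per lane (or
// only per unroll part when \p I is uniform after vectorization).
// Non-predicated replication is appended to \p VPBB; predicated replication is
// wrapped in a single-instruction if-then region, and a fresh VPBasicBlock is
// returned to hold subsequent recipes.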
7486 VPBasicBlock *VPRecipeBuilder::handleReplication(
7487     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7488     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7489     VPlanPtr &Plan) {
7490   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7491       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7492       Range);
7493 
7494   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7495       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7496       Range);
7497 
7498   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7499                                        IsUniform, IsPredicated);
7500   setRecipe(I, Recipe);
7501 
7502   // Find if I uses a predicated instruction. If so, it will use its scalar
7503   // value. Avoid hoisting the insert-element which packs the scalar value into
7504   // a vector value, as that happens iff all users use the vector value.
7505   for (auto &Op : I->operands())
7506     if (auto *PredInst = dyn_cast<Instruction>(Op))
7507       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7508         PredInst2Recipe[PredInst]->setAlsoPack(false);
7509 
  // Finalize the recipe for Instr, handling the non-predicated case first.
7511   if (!IsPredicated) {
7512     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7513     VPBB->appendRecipe(Recipe);
7514     return VPBB;
7515   }
7516   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7517   assert(VPBB->getSuccessors().empty() &&
7518          "VPBB has successors when handling predicated replication.");
7519   // Record predicated instructions for above packing optimizations.
7520   PredInst2Recipe[I] = Recipe;
7521   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7522   VPBlockUtils::insertBlockAfter(Region, VPBB);
7523   auto *RegSucc = new VPBasicBlock();
7524   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7525   return RegSucc;
7526 }
7527 
7528 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7529                                                       VPRecipeBase *PredRecipe,
7530                                                       VPlanPtr &Plan) {
7531   // Instructions marked for predication are replicated and placed under an
7532   // if-then construct to prevent side-effects.
7533 
7534   // Generate recipes to compute the block mask for this region.
7535   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7536 
7537   // Build the triangular if-then region.
7538   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7539   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7540   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7541   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7542   auto *PHIRecipe =
7543       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7544   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7545   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7546   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7547 
7548   // Note: first set Entry as region entry and then connect successors starting
7549   // from it in order, to propagate the "parent" of each VPBasicBlock.
7550   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7551   VPBlockUtils::connectBlocks(Pred, Exit);
7552 
7553   return Region;
7554 }
7555 
7556 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7557                                                       VFRange &Range,
7558                                                       VPlanPtr &Plan) {
7559   // First, check for specific widening recipes that deal with calls, memory
7560   // operations, inductions and Phi nodes.
7561   if (auto *CI = dyn_cast<CallInst>(Instr))
7562     return tryToWidenCall(CI, Range, *Plan);
7563 
7564   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7565     return tryToWidenMemory(Instr, Range, Plan);
7566 
7567   VPRecipeBase *Recipe;
7568   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7569     if (Phi->getParent() != OrigLoop->getHeader())
7570       return tryToBlend(Phi, Plan);
7571     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7572       return Recipe;
7573     return new VPWidenPHIRecipe(Phi);
7574   }
7575 
7576   if (isa<TruncInst>(Instr) &&
7577       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7578     return Recipe;
7579 
7580   if (!shouldWiden(Instr, Range))
7581     return nullptr;
7582 
7583   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7584     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7585                                 OrigLoop);
7586 
7587   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7588     bool InvariantCond =
7589         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7590     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7591                                    InvariantCond);
7592   }
7593 
7594   return tryToWiden(Instr, *Plan);
7595 }
7596 
7597 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7598                                                         unsigned MaxVF) {
7599   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7600 
7601   // Collect conditions feeding internal conditional branches; they need to be
7602   // represented in VPlan for it to model masking.
7603   SmallPtrSet<Value *, 1> NeedDef;
7604 
7605   auto *Latch = OrigLoop->getLoopLatch();
7606   for (BasicBlock *BB : OrigLoop->blocks()) {
7607     if (BB == Latch)
7608       continue;
7609     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7610     if (Branch && Branch->isConditional())
7611       NeedDef.insert(Branch->getCondition());
7612   }
7613 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking.
7616   // Also, both the Phi and the live-out instruction of each reduction are
7617   // required in order to introduce a select between them in VPlan.
7618   if (CM.foldTailByMasking()) {
7619     if (Legal->getPrimaryInduction())
7620       NeedDef.insert(Legal->getPrimaryInduction());
7621     for (auto &Reduction : Legal->getReductionVars()) {
7622       NeedDef.insert(Reduction.first);
7623       NeedDef.insert(Reduction.second.getLoopExitInstr());
7624     }
7625   }
7626 
7627   // Collect instructions from the original loop that will become trivially dead
7628   // in the vectorized loop. We don't need to vectorize these instructions. For
7629   // example, original induction update instructions can become dead because we
7630   // separately emit induction "steps" when generating code for the new loop.
7631   // Similarly, we create a new latch condition when setting up the structure
7632   // of the new loop, so the old one can become dead.
7633   SmallPtrSet<Instruction *, 4> DeadInstructions;
7634   collectTriviallyDeadInstructions(DeadInstructions);
7635 
7636   // Add assume instructions we need to drop to DeadInstructions, to prevent
7637   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7639   // control flow is preserved, we should keep them.
7640   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7641   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7642 
7643   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7644   // Dead instructions do not need sinking. Remove them from SinkAfter.
7645   for (Instruction *I : DeadInstructions)
7646     SinkAfter.erase(I);
7647 
7648   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7649     VFRange SubRange = {VF, MaxVF + 1};
7650     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7651                                              DeadInstructions, SinkAfter));
7652     VF = SubRange.End;
7653   }
7654 }
7655 
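// Build a VPlan for the VFs in \p Range using VPRecipes: scan the loop blocks
// in reverse post-order, create a recipe for each relevant instruction
// (widening where the cost model allows it, replicating otherwise), then apply
// the recorded sink-after, interleave-group and in-loop reduction adjustments
// and, when folding the tail, introduce the reduction selects.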
7656 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7657     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7658     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7659     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7660 
7661   // Hold a mapping from predicated instructions to their recipes, in order to
7662   // fix their AlsoPack behavior if a user is determined to replicate and use a
7663   // scalar instead of vector value.
7664   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7665 
7666   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7667 
7668   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7669 
7670   // ---------------------------------------------------------------------------
7671   // Pre-construction: record ingredients whose recipes we'll need to further
7672   // process after constructing the initial VPlan.
7673   // ---------------------------------------------------------------------------
7674 
7675   // Mark instructions we'll need to sink later and their targets as
7676   // ingredients whose recipe we'll need to record.
7677   for (auto &Entry : SinkAfter) {
7678     RecipeBuilder.recordRecipeOf(Entry.first);
7679     RecipeBuilder.recordRecipeOf(Entry.second);
7680   }
7681   for (auto &Reduction : CM.getInLoopReductionChains()) {
7682     PHINode *Phi = Reduction.first;
7683     RecurrenceDescriptor::RecurrenceKind Kind =
7684         Legal->getReductionVars()[Phi].getRecurrenceKind();
7685     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7686 
7687     RecipeBuilder.recordRecipeOf(Phi);
7688     for (auto &R : ReductionOperations) {
7689       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
7692       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7693           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7694         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7695       }
7696     }
7697   }
7698 
7699   // For each interleave group which is relevant for this (possibly trimmed)
7700   // Range, add it to the set of groups to be later applied to the VPlan and add
7701   // placeholders for its members' Recipes which we'll be replacing with a
7702   // single VPInterleaveRecipe.
7703   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7704     auto applyIG = [IG, this](ElementCount VF) -> bool {
7705       return (VF.isVector() && // Query is illegal for VF == 1
7706               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7707                   LoopVectorizationCostModel::CM_Interleave);
7708     };
7709     if (!getDecisionAndClampRange(applyIG, Range))
7710       continue;
7711     InterleaveGroups.insert(IG);
7712     for (unsigned i = 0; i < IG->getFactor(); i++)
7713       if (Instruction *Member = IG->getMember(i))
7714         RecipeBuilder.recordRecipeOf(Member);
  }
7716 
7717   // ---------------------------------------------------------------------------
7718   // Build initial VPlan: Scan the body of the loop in a topological order to
7719   // visit each basic block after having visited its predecessor basic blocks.
7720   // ---------------------------------------------------------------------------
7721 
7722   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7723   auto Plan = std::make_unique<VPlan>();
7724   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7725   Plan->setEntry(VPBB);
7726 
7727   // Represent values that will have defs inside VPlan.
7728   for (Value *V : NeedDef)
7729     Plan->addVPValue(V);
7730 
7731   // Scan the body of the loop in a topological order to visit each basic block
7732   // after having visited its predecessor basic blocks.
7733   LoopBlocksDFS DFS(OrigLoop);
7734   DFS.perform(LI);
7735 
7736   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7737     // Relevant instructions from basic block BB will be grouped into VPRecipe
7738     // ingredients and fill a new VPBasicBlock.
7739     unsigned VPBBsForBB = 0;
7740     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7741     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7742     VPBB = FirstVPBBForBB;
7743     Builder.setInsertPoint(VPBB);
7744 
7745     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7747     for (Instruction &I : BB->instructionsWithoutDebug()) {
7748       Instruction *Instr = &I;
7749 
7750       // First filter out irrelevant instructions, to ensure no recipes are
7751       // built for them.
7752       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7753         continue;
7754 
7755       if (auto Recipe =
7756               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7757         RecipeBuilder.setRecipe(Instr, Recipe);
7758         VPBB->appendRecipe(Recipe);
7759         continue;
7760       }
7761 
7762       // Otherwise, if all widening options failed, Instruction is to be
7763       // replicated. This may create a successor for VPBB.
7764       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7765           Instr, Range, VPBB, PredInst2Recipe, Plan);
7766       if (NextVPBB != VPBB) {
7767         VPBB = NextVPBB;
7768         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7769                                     : "");
7770       }
7771     }
7772   }
7773 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7777   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7778   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7779   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7780   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7781   delete PreEntry;
7782 
7783   // ---------------------------------------------------------------------------
7784   // Transform initial VPlan: Apply previously taken decisions, in order, to
7785   // bring the VPlan to its final state.
7786   // ---------------------------------------------------------------------------
7787 
7788   // Apply Sink-After legal constraints.
7789   for (auto &Entry : SinkAfter) {
7790     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7791     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7792     Sink->moveAfter(Target);
7793   }
7794 
7795   // Interleave memory: for each Interleave Group we marked earlier as relevant
7796   // for this VPlan, replace the Recipes widening its memory instructions with a
7797   // single VPInterleaveRecipe at its insertion point.
7798   for (auto IG : InterleaveGroups) {
7799     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7800         RecipeBuilder.getRecipe(IG->getInsertPos()));
7801     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7802         ->insertBefore(Recipe);
7803 
7804     for (unsigned i = 0; i < IG->getFactor(); ++i)
7805       if (Instruction *Member = IG->getMember(i)) {
7806         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7807       }
7808   }
7809 
7810   // Adjust the recipes for any inloop reductions.
7811   if (Range.Start > 1)
7812     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7813 
7814   // Finally, if tail is folded by masking, introduce selects between the phi
7815   // and the live-out instruction of each reduction, at the end of the latch.
7816   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7817     Builder.setInsertPoint(VPBB);
7818     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7819     for (auto &Reduction : Legal->getReductionVars()) {
7820       if (CM.isInLoopReduction(Reduction.first))
7821         continue;
7822       VPValue *Phi = Plan->getVPValue(Reduction.first);
7823       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7824       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7825     }
7826   }
7827 
7828   std::string PlanName;
7829   raw_string_ostream RSO(PlanName);
7830   ElementCount VF = ElementCount::getFixed(Range.Start);
7831   Plan->addVF(VF);
7832   RSO << "Initial VPlan for VF={" << VF;
7833   for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
7834     Plan->addVF(VF);
7835     RSO << "," << VF;
7836   }
7837   RSO << "},UF>=1";
7838   RSO.flush();
7839   Plan->setName(PlanName);
7840 
7841   return Plan;
7842 }
7843 
7844 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable.
7847   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7848   // the vectorization pipeline.
7849   assert(!OrigLoop->isInnermost());
7850   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7851 
7852   // Create new empty VPlan
7853   auto Plan = std::make_unique<VPlan>();
7854 
7855   // Build hierarchical CFG
7856   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7857   HCFGBuilder.buildHierarchicalCFG();
7858 
7859   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7860     Plan->addVF(ElementCount::getFixed(VF));
7861 
7862   if (EnableVPlanPredication) {
7863     VPlanPredicator VPP(*Plan);
7864     VPP.predicate();
7865 
7866     // Avoid running transformation to recipes until masked code generation in
7867     // VPlan-native path is in place.
7868     return Plan;
7869   }
7870 
7871   SmallPtrSet<Instruction *, 1> DeadInstructions;
7872   VPlanTransforms::VPInstructionsToVPRecipes(
7873       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7874   return Plan;
7875 }
7876 
// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
7881 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7882     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7883   for (auto &Reduction : CM.getInLoopReductionChains()) {
7884     PHINode *Phi = Reduction.first;
7885     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7886     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7887 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max the chain will be the select instructions.
7892     Instruction *Chain = Phi;
7893     for (Instruction *R : ReductionOperations) {
7894       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7895       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7896 
7897       VPValue *ChainOp = Plan->getVPValue(Chain);
7898       unsigned FirstOpId;
7899       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7900           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7901         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
7902                "Expected to replace a VPWidenSelectSC");
7903         FirstOpId = 1;
7904       } else {
7905         assert(isa<VPWidenRecipe>(WidenRecipe) &&
7906                "Expected to replace a VPWidenSC");
7907         FirstOpId = 0;
7908       }
7909       unsigned VecOpId =
7910           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7911       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7912 
7913       auto *CondOp = CM.foldTailByMasking()
7914                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
7915                          : nullptr;
7916       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7917           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
7918       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7919       WidenRecipe->eraseFromParent();
7920 
7921       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7922           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7923         VPRecipeBase *CompareRecipe =
7924             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7925         assert(isa<VPWidenRecipe>(CompareRecipe) &&
7926                "Expected to replace a VPWidenSC");
7927         CompareRecipe->eraseFromParent();
7928       }
7929       Chain = R;
7930     }
7931   }
7932 }
7933 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7938 
7939 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7940     Value *V, const VPIteration &Instance) {
7941   return ILV.getOrCreateScalarValue(V, Instance);
7942 }
7943 
7944 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7945                                VPSlotTracker &SlotTracker) const {
7946   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7947   IG->getInsertPos()->printAsOperand(O, false);
7948   O << ", ";
7949   getAddr()->printAsOperand(O, SlotTracker);
7950   VPValue *Mask = getMask();
7951   if (Mask) {
7952     O << ", ";
7953     Mask->printAsOperand(O, SlotTracker);
7954   }
7955   for (unsigned i = 0; i < IG->getFactor(); ++i)
7956     if (Instruction *I = IG->getMember(i))
7957       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7958 }
7959 
7960 void VPWidenCallRecipe::execute(VPTransformState &State) {
7961   State.ILV->widenCallInstruction(Ingredient, *this, State);
7962 }
7963 
7964 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7965   State.ILV->widenSelectInstruction(Ingredient, *this, InvariantCond, State);
7966 }
7967 
7968 void VPWidenRecipe::execute(VPTransformState &State) {
7969   State.ILV->widenInstruction(Ingredient, *this, State);
7970 }
7971 
7972 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7973   State.ILV->widenGEP(GEP, *this, State.UF, State.VF, IsPtrLoopInvariant,
7974                       IsIndexLoopInvariant, State);
7975 }
7976 
7977 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7978   assert(!State.Instance && "Int or FP induction being replicated.");
7979   State.ILV->widenIntOrFpInduction(IV, Trunc);
7980 }
7981 
7982 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7983   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7984 }
7985 
7986 void VPBlendRecipe::execute(VPTransformState &State) {
7987   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7988   // We know that all PHIs in non-header blocks are converted into
7989   // selects, so we don't have to worry about the insertion order and we
7990   // can just use the builder.
7991   // At this point we generate the predication tree. There may be
7992   // duplications since this is a simple recursive scan, but future
7993   // optimizations will clean it up.
7994 
7995   unsigned NumIncoming = getNumIncomingValues();
7996 
7997   // Generate a sequence of selects of the form:
7998   // SELECT(Mask3, In3,
7999   //        SELECT(Mask2, In2,
8000   //               SELECT(Mask1, In1,
8001   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and are taken from In0.
8004   InnerLoopVectorizer::VectorParts Entry(State.UF);
8005   for (unsigned In = 0; In < NumIncoming; ++In) {
8006     for (unsigned Part = 0; Part < State.UF; ++Part) {
8007       // We might have single edge PHIs (blocks) - use an identity
8008       // 'select' for the first PHI operand.
8009       Value *In0 = State.get(getIncomingValue(In), Part);
8010       if (In == 0)
8011         Entry[Part] = In0; // Initialize with the first incoming value.
8012       else {
8013         // Select between the current value and the previous incoming edge
8014         // based on the incoming mask.
8015         Value *Cond = State.get(getMask(In), Part);
8016         Entry[Part] =
8017             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8018       }
8019     }
8020   }
8021   for (unsigned Part = 0; Part < State.UF; ++Part)
8022     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8023 }
8024 
8025 void VPInterleaveRecipe::execute(VPTransformState &State) {
8026   assert(!State.Instance && "Interleave group being replicated.");
8027   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
8028 }
8029 
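// Generate the in-loop reduction: for each unrolled part, select the reduction
// identity for masked-out lanes when a condition is present, reduce the vector
// operand to a scalar with a target reduction, and combine the result with the
// previous value of the chain (via a min/max op or the reduction's binary
// opcode).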
8030 void VPReductionRecipe::execute(VPTransformState &State) {
8031   assert(!State.Instance && "Reduction being replicated.");
8032   for (unsigned Part = 0; Part < State.UF; ++Part) {
8033     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8034     Value *NewVecOp = State.get(VecOp, Part);
8035     if (CondOp) {
8036       Value *NewCond = State.get(CondOp, Part);
8037       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8038       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8039           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8040       Constant *IdenVec =
8041           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8042       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8043       NewVecOp = Select;
8044     }
8045     Value *NewRed =
8046         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8047     Value *PrevInChain = State.get(ChainOp, Part);
8048     Value *NextInChain;
8049     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8050         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8051       NextInChain =
8052           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8053                          NewRed, PrevInChain);
8054     } else {
8055       NextInChain = State.Builder.CreateBinOp(
8056           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
8057     }
8058     State.ValueMap.setVectorValue(I, Part, NextInChain);
8059   }
8060 }
8061 
8062 void VPReplicateRecipe::execute(VPTransformState &State) {
8063   if (State.Instance) { // Generate a single instance.
8064     State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance,
8065                                     IsPredicated, State);
8066     // Insert scalar instance packing it into a vector.
8067     if (AlsoPack && State.VF.isVector()) {
8068       // If we're constructing lane 0, initialize to start from undef.
8069       if (State.Instance->Lane == 0) {
8070         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8071         Value *Undef =
8072             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8073         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8074       }
8075       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8076     }
8077     return;
8078   }
8079 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8083   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8084   for (unsigned Part = 0; Part < State.UF; ++Part)
8085     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8086       State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
8087                                       IsPredicated, State);
8088 }
8089 
8090 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8091   assert(State.Instance && "Branch on Mask works only on single instance.");
8092 
8093   unsigned Part = State.Instance->Part;
8094   unsigned Lane = State.Instance->Lane;
8095 
8096   Value *ConditionBit = nullptr;
8097   VPValue *BlockInMask = getMask();
8098   if (BlockInMask) {
8099     ConditionBit = State.get(BlockInMask, Part);
8100     if (ConditionBit->getType()->isVectorTy())
8101       ConditionBit = State.Builder.CreateExtractElement(
8102           ConditionBit, State.Builder.getInt32(Lane));
8103   } else // Block in mask is all-one.
8104     ConditionBit = State.Builder.getTrue();
8105 
8106   // Replace the temporary unreachable terminator with a new conditional branch,
8107   // whose two destinations will be set later when they are created.
8108   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8109   assert(isa<UnreachableInst>(CurrentTerminator) &&
8110          "Expected to replace unreachable terminator with conditional branch.");
8111   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8112   CondBr->setSuccessor(0, nullptr);
8113   ReplaceInstWithInst(CurrentTerminator, CondBr);
8114 }
8115 
8116 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8117   assert(State.Instance && "Predicated instruction PHI works per instance.");
8118   Instruction *ScalarPredInst = cast<Instruction>(
8119       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8120   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8121   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8122   assert(PredicatingBB && "Predicated block has no single predecessor.");
8123 
8124   // By current pack/unpack logic we need to generate only a single phi node: if
8125   // a vector value for the predicated instruction exists at this point it means
8126   // the instruction has vector users only, and a phi for the vector value is
8127   // needed. In this case the recipe of the predicated instruction is marked to
8128   // also do that packing, thereby "hoisting" the insert-element sequence.
8129   // Otherwise, a phi node for the scalar value is needed.
8130   unsigned Part = State.Instance->Part;
8131   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8132     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8133     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8134     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8135     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8136     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8137     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8138   } else {
8139     Type *PredInstType = PredInst->getType();
8140     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8141     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8142     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8143     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8144   }
8145 }
8146 
8147 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8148   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8149   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
8150                                         getMask());
8151 }
8152 
8153 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8154 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8155 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8156 // for predication.
8157 static ScalarEpilogueLowering getScalarEpilogueLowering(
8158     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8159     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8160     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8161     LoopVectorizationLegality &LVL) {
8162   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8163   // don't look at hints or options, and don't request a scalar epilogue.
8164   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8165   // LoopAccessInfo (due to code dependency and not being able to reliably get
8166   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8167   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8168   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8169   // back to the old way and vectorize with versioning when forced. See D81345.)
8170   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8171                                                       PGSOQueryType::IRPass) &&
8172                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8173     return CM_ScalarEpilogueNotAllowedOptSize;
8174 
8175   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8176                               !PreferPredicateOverEpilogue;
8177 
8178   // 2) Next, if disabling predication is requested on the command line, honour
8179   // this and request a scalar epilogue.
8180   if (PredicateOptDisabled)
8181     return CM_ScalarEpilogueAllowed;
8182 
8183   // 3) and 4) look if enabling predication is requested on the command line,
8184   // with a loop hint, or if the TTI hook indicates this is profitable, request
8185   // predication.
8186   if (PreferPredicateOverEpilogue ||
8187       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8188       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8189                                         LVL.getLAI()) &&
8190        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8191     return CM_ScalarEpilogueNotNeededUsePredicate;
8192 
8193   return CM_ScalarEpilogueAllowed;
8194 }
8195 
8196 // Process the loop in the VPlan-native vectorization path. This path builds
8197 // VPlan upfront in the vectorization pipeline, which allows to apply
8198 // VPlan-to-VPlan transformations from the very beginning without modifying the
8199 // input LLVM IR.
8200 static bool processLoopInVPlanNativePath(
8201     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8202     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8203     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8204     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8205     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8206 
8207   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
8208     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8209     return false;
8210   }
8211   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8212   Function *F = L->getHeader()->getParent();
8213   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8214 
8215   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8216       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8217 
8218   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8219                                 &Hints, IAI);
8220   // Use the planner for outer loop vectorization.
8221   // TODO: CM is not used at this point inside the planner. Turn CM into an
8222   // optional argument if we don't need it in the future.
8223   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8224 
8225   // Get user vectorization factor.
8226   const unsigned UserVF = Hints.getWidth();
8227 
8228   // Plan how to best vectorize, return the best VF and its cost.
8229   const VectorizationFactor VF =
8230       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8231 
8232   // If we are stress testing VPlan builds, do not attempt to generate vector
8233   // code. Masked vector code generation support will follow soon.
8234   // Also, do not attempt to vectorize if no vector code will be produced.
8235   if (VPlanBuildStressTest || EnableVPlanPredication ||
8236       VectorizationFactor::Disabled() == VF)
8237     return false;
8238 
8239   LVP.setBestPlan(VF.Width, 1);
8240 
8241   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8242                          &CM, BFI, PSI);
8243   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8244                     << L->getHeader()->getParent()->getName() << "\"\n");
8245   LVP.executePlan(LB, DT);
8246 
8247   // Mark the loop as already vectorized to avoid vectorizing again.
8248   Hints.setAlreadyVectorized();
8249 
8250   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8251   return true;
8252 }
8253 
8254 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8255     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8256                                !EnableLoopInterleaving),
8257       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8258                               !EnableLoopVectorization) {}
8259 
8260 bool LoopVectorizePass::processLoop(Loop *L) {
8261   assert((EnableVPlanNativePath || L->isInnermost()) &&
8262          "VPlan-native path is not enabled. Only process inner loops.");
8263 
8264 #ifndef NDEBUG
8265   const std::string DebugLocStr = getDebugLocString(L);
8266 #endif /* NDEBUG */
8267 
8268   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8269                     << L->getHeader()->getParent()->getName() << "\" from "
8270                     << DebugLocStr << "\n");
8271 
8272   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8273 
8274   LLVM_DEBUG(
8275       dbgs() << "LV: Loop hints:"
8276              << " force="
8277              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8278                      ? "disabled"
8279                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8280                             ? "enabled"
8281                             : "?"))
8282              << " width=" << Hints.getWidth()
8283              << " unroll=" << Hints.getInterleave() << "\n");
8284 
8285   // Function containing loop
8286   Function *F = L->getHeader()->getParent();
8287 
8288   // Looking at the diagnostic output is the only way to determine if a loop
8289   // was vectorized (other than looking at the IR or machine code), so it
8290   // is important to generate an optimization remark for each loop. Most of
8291   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report, respectively, vectorized loops and unvectorized loops
  // that may benefit from vectorization.
8295 
8296   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8297     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8298     return false;
8299   }
8300 
8301   PredicatedScalarEvolution PSE(*SE, *L);
8302 
8303   // Check if it is legal to vectorize the loop.
8304   LoopVectorizationRequirements Requirements(*ORE);
8305   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8306                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8307   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8308     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8309     Hints.emitRemarkWithHints();
8310     return false;
8311   }
8312 
8313   // Check the function attributes and profiles to find out if this function
8314   // should be optimized for size.
8315   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8316       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8317 
8318   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8319   // here. They may require CFG and instruction level transformations before
8320   // even evaluating whether vectorization is profitable. Since we cannot modify
8321   // the incoming IR, we need to build VPlan upfront in the vectorization
8322   // pipeline.
8323   if (!L->isInnermost())
8324     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8325                                         ORE, BFI, PSI, Hints);
8326 
8327   assert(L->isInnermost() && "Inner loop expected.");
8328 
8329   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8330   // count by optimizing for size, to minimize overheads.
8331   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8332   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8333     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8334                       << "This loop is worth vectorizing only if no scalar "
8335                       << "iteration overheads are incurred.");
8336     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8337       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8338     else {
8339       LLVM_DEBUG(dbgs() << "\n");
8340       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8341     }
8342   }
8343 
8344   // Check the function attributes to see if implicit floats are allowed.
8345   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8346   // an integer loop and the vector instructions selected are purely integer
8347   // vector instructions?
8348   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8349     reportVectorizationFailure(
8350         "Can't vectorize when the NoImplicitFloat attribute is used",
8351         "loop not vectorized due to NoImplicitFloat attribute",
8352         "NoImplicitFloat", ORE, L);
8353     Hints.emitRemarkWithHints();
8354     return false;
8355   }
8356 
8357   // Check if the target supports potentially unsafe FP vectorization.
8358   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8359   // for the target we're vectorizing for, to make sure none of the
8360   // additional fp-math flags can help.
8361   if (Hints.isPotentiallyUnsafe() &&
8362       TTI->isFPVectorizationPotentiallyUnsafe()) {
8363     reportVectorizationFailure(
8364         "Potentially unsafe FP op prevents vectorization",
8365         "loop not vectorized due to unsafe FP support.",
8366         "UnsafeFP", ORE, L);
8367     Hints.emitRemarkWithHints();
8368     return false;
8369   }
8370 
8371   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8372   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8373 
8374   // If an override option has been passed in for interleaved accesses, use it.
8375   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8376     UseInterleaved = EnableInterleavedMemAccesses;
8377 
8378   // Analyze interleaved memory accesses.
8379   if (UseInterleaved) {
8380     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8381   }
8382 
8383   // Use the cost model.
8384   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8385                                 F, &Hints, IAI);
8386   CM.collectValuesToIgnore();
8387 
8388   // Use the planner for vectorization.
8389   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8390 
8391   // Get user vectorization factor and interleave count.
8392   unsigned UserVF = Hints.getWidth();
8393   unsigned UserIC = Hints.getInterleave();
8394 
8395   // Plan how to best vectorize, return the best VF and its cost.
8396   Optional<VectorizationFactor> MaybeVF =
8397       LVP.plan(ElementCount::getFixed(UserVF), UserIC);
8398 
  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

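  // Commit to the selected VF and interleave count; the planner keeps only the
  // VPlan that matches this decision.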
  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
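  // Remember the original loop metadata so that follow-up metadata for the
  // remainder loop can be derived from it once the transformation is done.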
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // We decided not to vectorize the loop (the chosen VF is scalar), so
    // interleave the scalar loop instead.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

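    // Report the interleaving decision.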
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // We decided to vectorize the loop, so do it now.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks for strides and memory: a scalar loop that is
    // rarely executed is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

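  // If the original loop carried follow-up metadata for the remainder
  // (epilogue) loop, attach it; otherwise fall back to the default handling
  // below.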
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

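  // In builds with assertions, verify that the transformed function is still
  // well formed.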
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
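  // Cache the function-level analyses; they are used by the per-loop
  // processing below.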
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether any loop was changed and whether the CFG was modified.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
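  // Gather the function-level analyses and forward them to runImpl.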
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

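  // LoopAccessAnalysis is a loop analysis, so provide a callback that queries
  // it lazily through the inner loop analysis manager.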
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}