//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
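//
// For example, with a vector width of 4 a loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually widened (illustrative C-like pseudocode only; the actual
// transformation operates on LLVM IR and also emits runtime checks and a
// scalar remainder loop) into
//
//   for (i = 0; i < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];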
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// The option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; it lists the available choices.
// That is, the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, the fallback strategy depends on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if "
                         "tail-folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of the loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
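/// For example (assuming a typical data layout), an i20 has a store size of
/// 3 bytes but an alloc size of 4 bytes, so an array of i20 needs padding
/// between elements and the type is treated as irregular; i32 and float are
/// regular.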
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
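///
/// For example, with the current return value of 2, the cost model divides
/// the cost of the instructions in a predicated block by 2 when accumulating
/// the expected cost of one loop iteration.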
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
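  ///
  /// For example, with UF = 2 an original value is represented by two vector
  /// parts in the unrolled vector loop, and getOrCreateVectorValue(V, 1)
  /// returns (creating it on demand if necessary) the value for the second
  /// unrolled copy.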
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
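  /// (e.g., TripCount = 1003 with VF = 4 and UF = 2 gives
  /// VectorTripCount = 1003 - 1003 % 8 = 1000).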
838   Value *VectorTripCount = nullptr;
839 
840   /// The legality analysis.
841   LoopVectorizationLegality *Legal;
842 
843   /// The profitablity analysis.
844   LoopVectorizationCostModel *Cost;
845 
846   // Record whether runtime checks are added.
847   bool AddedSafetyChecks = false;
848 
849   // Holds the end values for each induction variable. We save the end values
850   // so we can later fix-up the external users of the induction variables.
851   DenseMap<PHINode *, Value *> IVEndValues;
852 
853   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
854   // fixed up at the end of vector code generation.
855   SmallVector<PHINode *, 8> OrigPHIsToFix;
856 
857   /// BFI and PSI are used to check for profile guided size optimizations.
858   BlockFrequencyInfo *BFI;
859   ProfileSummaryInfo *PSI;
860 
861   // Whether this loop should be optimized for size based on profile guided size
862   // optimizatios.
863   bool OptForSizeBasedOnProfile;
864 };
865 
866 class InnerLoopUnroller : public InnerLoopVectorizer {
867 public:
868   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
869                     LoopInfo *LI, DominatorTree *DT,
870                     const TargetLibraryInfo *TLI,
871                     const TargetTransformInfo *TTI, AssumptionCache *AC,
872                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
873                     LoopVectorizationLegality *LVL,
874                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
875                     ProfileSummaryInfo *PSI)
876       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
877                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
878                             BFI, PSI) {}
879 
880 private:
881   Value *getBroadcastInstrs(Value *V) override;
882   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
883                        Instruction::BinaryOps Opcode =
884                        Instruction::BinaryOpsEnd) override;
885   Value *reverseVector(Value *Vec) override;
886 };
887 
888 } // end namespace llvm
889 
890 /// Look for a meaningful debug location on the instruction or it's
891 /// operands.
892 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
893   if (!I)
894     return I;
895 
896   DebugLoc Empty;
897   if (I->getDebugLoc() != Empty)
898     return I;
899 
900   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
901     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
902       if (OpInst->getDebugLoc() != Empty)
903         return OpInst;
904   }
905 
906   return I;
907 }
908 
909 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
910   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
911     const DILocation *DIL = Inst->getDebugLoc();
912     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
913         !isa<DbgInfoIntrinsic>(Inst)) {
914       assert(!VF.isScalable() && "scalable vectors not yet supported.");
915       auto NewDIL =
916           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
917       if (NewDIL)
918         B.SetCurrentDebugLocation(NewDIL.getValue());
919       else
920         LLVM_DEBUG(dbgs()
921                    << "Failed to create new discriminator: "
922                    << DIL->getFilename() << " Line: " << DIL->getLine());
923     }
924     else
925       B.SetCurrentDebugLocation(DIL);
926   } else
927     B.SetCurrentDebugLocation(DebugLoc());
928 }
929 
930 /// Write a record \p DebugMsg about vectorization failure to the debug
931 /// output stream. If \p I is passed, it is an instruction that prevents
932 /// vectorization.
933 #ifndef NDEBUG
934 static void debugVectorizationFailure(const StringRef DebugMsg,
935     Instruction *I) {
936   dbgs() << "LV: Not vectorizing: " << DebugMsg;
937   if (I != nullptr)
938     dbgs() << " " << *I;
939   else
940     dbgs() << '.';
941   dbgs() << '\n';
942 }
943 #endif
944 
945 /// Create an analysis remark that explains why vectorization failed
946 ///
947 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
948 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
949 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
950 /// the location of the remark.  \return the remark object that can be
951 /// streamed to.
952 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
953     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
954   Value *CodeRegion = TheLoop->getHeader();
955   DebugLoc DL = TheLoop->getStartLoc();
956 
957   if (I) {
958     CodeRegion = I->getParent();
959     // If there is no debug location attached to the instruction, revert back to
960     // using the loop's.
961     if (I->getDebugLoc())
962       DL = I->getDebugLoc();
963   }
964 
965   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
966   R << "loop not vectorized: ";
967   return R;
968 }
969 
970 namespace llvm {
971 
972 void reportVectorizationFailure(const StringRef DebugMsg,
973     const StringRef OREMsg, const StringRef ORETag,
974     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
975   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
976   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
977   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
978                 ORETag, TheLoop, I) << OREMsg);
979 }
980 
981 } // end namespace llvm
982 
983 #ifndef NDEBUG
984 /// \return string containing a file name and a line # for the given loop.
985 static std::string getDebugLocString(const Loop *L) {
986   std::string Result;
987   if (L) {
988     raw_string_ostream OS(Result);
989     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
990       LoopDbgLoc.print(OS);
991     else
992       // Just print the module name.
993       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
994     OS.flush();
995   }
996   return Result;
997 }
998 #endif
999 
1000 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1001                                          const Instruction *Orig) {
1002   // If the loop was versioned with memchecks, add the corresponding no-alias
1003   // metadata.
1004   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1005     LVer->annotateInstWithNoAlias(To, Orig);
1006 }
1007 
1008 void InnerLoopVectorizer::addMetadata(Instruction *To,
1009                                       Instruction *From) {
1010   propagateMetadata(To, From);
1011   addNewMetadata(To, From);
1012 }
1013 
1014 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1015                                       Instruction *From) {
1016   for (Value *V : To) {
1017     if (Instruction *I = dyn_cast<Instruction>(V))
1018       addMetadata(I, From);
1019   }
1020 }
1021 
1022 namespace llvm {
1023 
1024 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1025 // lowered.
1026 enum ScalarEpilogueLowering {
1027 
1028   // The default: allowing scalar epilogues.
1029   CM_ScalarEpilogueAllowed,
1030 
1031   // Vectorization with OptForSize: don't allow epilogues.
1032   CM_ScalarEpilogueNotAllowedOptSize,
1033 
1034   // A special case of vectorisation with OptForSize: loops with a very small
1035   // trip count are considered for vectorization under OptForSize, thereby
1036   // making sure the cost of their loop body is dominant, free of runtime
1037   // guards and scalar iteration overheads.
1038   CM_ScalarEpilogueNotAllowedLowTripLoop,
1039 
1040   // Loop hint predicate indicating an epilogue is undesired.
1041   CM_ScalarEpilogueNotNeededUsePredicate
1042 };
1043 
1044 /// LoopVectorizationCostModel - estimates the expected speedups due to
1045 /// vectorization.
1046 /// In many cases vectorization is not profitable. This can happen because of
1047 /// a number of reasons. In this class we mainly attempt to predict the
1048 /// expected speedup/slowdowns due to the supported instruction set. We use the
1049 /// TargetTransformInfo to query the different backends for the cost of
1050 /// different operations.
1051 class LoopVectorizationCostModel {
1052 public:
1053   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1054                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1055                              LoopVectorizationLegality *Legal,
1056                              const TargetTransformInfo &TTI,
1057                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1058                              AssumptionCache *AC,
1059                              OptimizationRemarkEmitter *ORE, const Function *F,
1060                              const LoopVectorizeHints *Hints,
1061                              InterleavedAccessInfo &IAI)
1062       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1063         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1064         Hints(Hints), InterleaveInfo(IAI) {}
1065 
1066   /// \return An upper bound for the vectorization factor, or None if
1067   /// vectorization and interleaving should be avoided up front.
1068   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1069 
1070   /// \return True if runtime checks are required for vectorization, and false
1071   /// otherwise.
1072   bool runtimeChecksRequired();
1073 
1074   /// \return The most profitable vectorization factor and the cost of that VF.
1075   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1076   /// then this vectorization factor will be selected if vectorization is
1077   /// possible.
1078   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1079 
1080   /// Setup cost-based decisions for user vectorization factor.
1081   void selectUserVectorizationFactor(ElementCount UserVF) {
1082     collectUniformsAndScalars(UserVF);
1083     collectInstsToScalarize(UserVF);
1084   }
1085 
1086   /// \return The size (in bits) of the smallest and widest types in the code
1087   /// that needs to be vectorized. We ignore values that remain scalar such as
1088   /// 64 bit loop indices.
1089   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1090 
1091   /// \return The desired interleave count.
1092   /// If interleave count has been specified by metadata it will be returned.
1093   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1094   /// are the selected vectorization factor and the cost of the selected VF.
1095   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1096 
1097   /// Memory access instruction may be vectorized in more than one way.
1098   /// Form of instruction after vectorization depends on cost.
1099   /// This function takes cost-based decisions for Load/Store instructions
1100   /// and collects them in a map. This decisions map is used for building
1101   /// the lists of loop-uniform and loop-scalar instructions.
1102   /// The calculated cost is saved with widening decision in order to
1103   /// avoid redundant calculations.
1104   void setCostBasedWideningDecision(ElementCount VF);
1105 
1106   /// A struct that represents some properties of the register usage
1107   /// of a loop.
1108   struct RegisterUsage {
1109     /// Holds the number of loop invariant values that are used in the loop.
1110     /// The key is ClassID of target-provided register class.
1111     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1112     /// Holds the maximum number of concurrent live intervals in the loop.
1113     /// The key is ClassID of target-provided register class.
1114     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1115   };
1116 
1117   /// \return Returns information about the register usages of the loop for the
1118   /// given vectorization factors.
1119   SmallVector<RegisterUsage, 8>
1120   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1121 
1122   /// Collect values we want to ignore in the cost model.
1123   void collectValuesToIgnore();
1124 
1125   /// Split reductions into those that happen in the loop, and those that happen
1126   /// outside. In loop reductions are collected into InLoopReductionChains.
1127   void collectInLoopReductions();
1128 
1129   /// \returns The smallest bitwidth each instruction can be represented with.
1130   /// The vector equivalents of these instructions should be truncated to this
1131   /// type.
1132   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1133     return MinBWs;
1134   }
1135 
1136   /// \returns True if it is more profitable to scalarize instruction \p I for
1137   /// vectorization factor \p VF.
1138   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1139     assert(VF.isVector() &&
1140            "Profitable to scalarize relevant only for VF > 1.");
1141 
1142     // Cost model is not run in the VPlan-native path - return conservative
1143     // result until this changes.
1144     if (EnableVPlanNativePath)
1145       return false;
1146 
1147     auto Scalars = InstsToScalarize.find(VF);
1148     assert(Scalars != InstsToScalarize.end() &&
1149            "VF not yet analyzed for scalarization profitability");
1150     return Scalars->second.find(I) != Scalars->second.end();
1151   }
1152 
1153   /// Returns true if \p I is known to be uniform after vectorization.
1154   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1155     if (VF.isScalar())
1156       return true;
1157 
1158     // Cost model is not run in the VPlan-native path - return conservative
1159     // result until this changes.
1160     if (EnableVPlanNativePath)
1161       return false;
1162 
1163     auto UniformsPerVF = Uniforms.find(VF);
1164     assert(UniformsPerVF != Uniforms.end() &&
1165            "VF not yet analyzed for uniformity");
1166     return UniformsPerVF->second.count(I);
1167   }
1168 
1169   /// Returns true if \p I is known to be scalar after vectorization.
1170   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1171     if (VF.isScalar())
1172       return true;
1173 
1174     // Cost model is not run in the VPlan-native path - return conservative
1175     // result until this changes.
1176     if (EnableVPlanNativePath)
1177       return false;
1178 
1179     auto ScalarsPerVF = Scalars.find(VF);
1180     assert(ScalarsPerVF != Scalars.end() &&
1181            "Scalar values are not calculated for VF");
1182     return ScalarsPerVF->second.count(I);
1183   }
1184 
1185   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1186   /// for vectorization factor \p VF.
1187   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1188     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1189            !isProfitableToScalarize(I, VF) &&
1190            !isScalarAfterVectorization(I, VF);
1191   }
1192 
1193   /// Decision that was taken during cost calculation for memory instruction.
1194   enum InstWidening {
1195     CM_Unknown,
1196     CM_Widen,         // For consecutive accesses with stride +1.
1197     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1198     CM_Interleave,
1199     CM_GatherScatter,
1200     CM_Scalarize
1201   };
1202 
1203   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1204   /// instruction \p I and vector width \p VF.
1205   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1206                            unsigned Cost) {
1207     assert(VF.isVector() && "Expected VF >=2");
1208     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1209   }
1210 
1211   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1212   /// interleaving group \p Grp and vector width \p VF.
1213   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1214                            ElementCount VF, InstWidening W, unsigned Cost) {
1215     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1218     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1219       if (auto *I = Grp->getMember(i)) {
1220         if (Grp->getInsertPos() == I)
1221           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1222         else
1223           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1224       }
1225     }
1226   }
1227 
1228   /// Return the cost model decision for the given instruction \p I and vector
1229   /// width \p VF. Return CM_Unknown if this instruction did not pass
1230   /// through the cost modeling.
1231   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1232     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1233     assert(VF.isVector() && "Expected VF >=2");
1234 
1235     // Cost model is not run in the VPlan-native path - return conservative
1236     // result until this changes.
1237     if (EnableVPlanNativePath)
1238       return CM_GatherScatter;
1239 
1240     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1241     auto Itr = WideningDecisions.find(InstOnVF);
1242     if (Itr == WideningDecisions.end())
1243       return CM_Unknown;
1244     return Itr->second.first;
1245   }
1246 
1247   /// Return the vectorization cost for the given instruction \p I and vector
1248   /// width \p VF.
1249   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1250     assert(VF.isVector() && "Expected VF >=2");
1251     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1252     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1253            "The cost is not calculated");
1254     return WideningDecisions[InstOnVF].second;
1255   }
1256 
1257   /// Return True if instruction \p I is an optimizable truncate whose operand
1258   /// is an induction variable. Such a truncate will be removed by adding a new
1259   /// induction variable with the destination type.
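  /// For example (illustrative IR):
  ///   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  ///   %trunc = trunc i64 %iv to i32
  /// can be optimized by creating a new i32 induction variable and replacing
  /// the truncate with it.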
1260   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1261     // If the instruction is not a truncate, return false.
1262     auto *Trunc = dyn_cast<TruncInst>(I);
1263     if (!Trunc)
1264       return false;
1265 
1266     // Get the source and destination types of the truncate.
1267     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1268     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1269 
1270     // If the truncate is free for the given types, return false. Replacing a
1271     // free truncate with an induction variable would add an induction variable
1272     // update instruction to each iteration of the loop. We exclude from this
1273     // check the primary induction variable since it will need an update
1274     // instruction regardless.
1275     Value *Op = Trunc->getOperand(0);
1276     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1277       return false;
1278 
1279     // If the truncated value is not an induction variable, return false.
1280     return Legal->isInductionPhi(Op);
1281   }
1282 
1283   /// Collects the instructions to scalarize for each predicated instruction in
1284   /// the loop.
1285   void collectInstsToScalarize(ElementCount VF);
1286 
  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions, which
  /// may be vectorized as interleave, gather-scatter or scalarized accesses.
1290   void collectUniformsAndScalars(ElementCount VF) {
1291     // Do the analysis once.
1292     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1293       return;
1294     setCostBasedWideningDecision(VF);
1295     collectLoopUniforms(VF);
1296     collectLoopScalars(VF);
1297   }
1298 
1299   /// Returns true if the target machine supports masked store operation
1300   /// for the given \p DataType and kind of access to \p Ptr.
1301   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1302     return Legal->isConsecutivePtr(Ptr) &&
1303            TTI.isLegalMaskedStore(DataType, Alignment);
1304   }
1305 
1306   /// Returns true if the target machine supports masked load operation
1307   /// for the given \p DataType and kind of access to \p Ptr.
1308   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1309     return Legal->isConsecutivePtr(Ptr) &&
1310            TTI.isLegalMaskedLoad(DataType, Alignment);
1311   }
1312 
1313   /// Returns true if the target machine supports masked scatter operation
1314   /// for the given \p DataType.
1315   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1316     return TTI.isLegalMaskedScatter(DataType, Alignment);
1317   }
1318 
1319   /// Returns true if the target machine supports masked gather operation
1320   /// for the given \p DataType.
1321   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1322     return TTI.isLegalMaskedGather(DataType, Alignment);
1323   }
1324 
1325   /// Returns true if the target machine can represent \p V as a masked gather
1326   /// or scatter operation.
1327   bool isLegalGatherOrScatter(Value *V) {
1328     bool LI = isa<LoadInst>(V);
1329     bool SI = isa<StoreInst>(V);
1330     if (!LI && !SI)
1331       return false;
1332     auto *Ty = getMemInstValueType(V);
1333     Align Align = getLoadStoreAlignment(V);
1334     return (LI && isLegalMaskedGather(Ty, Align)) ||
1335            (SI && isLegalMaskedScatter(Ty, Align));
1336   }
1337 
1338   /// Returns true if \p I is an instruction that will be scalarized with
1339   /// predication. Such instructions include conditional stores and
1340   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
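  /// For example (illustrative C source):
  ///   if (c[i] != 0)
  ///     a[i] = b[i] / c[i];
  /// The division may trap for masked-off lanes, so it is scalarized and
  /// executed under its block's predicate rather than widened.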
1343   bool isScalarWithPredication(Instruction *I,
1344                                ElementCount VF = ElementCount::getFixed(1));
1345 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or through a masked load/store or masked
  /// gather/scatter. This is a superset of the instructions for which
  /// isScalarWithPredication returns true.
1349   bool isPredicatedInst(Instruction *I) {
1350     if (!blockNeedsPredication(I->getParent()))
1351       return false;
1352     // Loads and stores that need some form of masked operation are predicated
1353     // instructions.
1354     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1355       return Legal->isMaskRequired(I);
1356     return isScalarWithPredication(I);
1357   }
1358 
1359   /// Returns true if \p I is a memory instruction with consecutive memory
1360   /// access that can be widened.
1361   bool
1362   memoryInstructionCanBeWidened(Instruction *I,
1363                                 ElementCount VF = ElementCount::getFixed(1));
1364 
1365   /// Returns true if \p I is a memory instruction in an interleaved-group
1366   /// of memory accesses that can be vectorized with wide vector loads/stores
1367   /// and shuffles.
1368   bool
1369   interleavedAccessCanBeWidened(Instruction *I,
1370                                 ElementCount VF = ElementCount::getFixed(1));
1371 
1372   /// Check if \p Instr belongs to any interleaved access group.
1373   bool isAccessInterleaved(Instruction *Instr) {
1374     return InterleaveInfo.isInterleaved(Instr);
1375   }
1376 
1377   /// Get the interleaved access group that \p Instr belongs to.
1378   const InterleaveGroup<Instruction> *
1379   getInterleavedAccessGroup(Instruction *Instr) {
1380     return InterleaveInfo.getInterleaveGroup(Instr);
1381   }
1382 
1383   /// Returns true if an interleaved group requires a scalar iteration
1384   /// to handle accesses with gaps, and there is nothing preventing us from
1385   /// creating a scalar epilogue.
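  /// For example (illustrative), an interleave group accessing { A[3*i],
  /// A[3*i+1] } with a gap at member index 2 loads a full factor-3 wide
  /// vector; running the last iteration(s) in the scalar epilogue avoids
  /// reading past the end of A through that gap.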
1386   bool requiresScalarEpilogue() const {
1387     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1388   }
1389 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disabled by optsize or a loop hint annotation.
1392   bool isScalarEpilogueAllowed() const {
1393     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1394   }
1395 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1397   bool foldTailByMasking() const { return FoldTailByMasking; }
1398 
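  /// Returns true if the instructions in \p BB must execute under a predicate,
  /// either because the original loop requires it or because the tail is
  /// folded by masking.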
1399   bool blockNeedsPredication(BasicBlock *BB) {
1400     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1401   }
1402 
1403   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1404   /// nodes to the chain of instructions representing the reductions. Uses a
1405   /// MapVector to ensure deterministic iteration order.
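  /// For example (illustrative), for an in-loop integer reduction
  ///   sum += a[i];
  /// the map would contain an entry from the 'sum' phi node to the chain
  /// consisting of the single add instruction.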
1406   using ReductionChainMap =
1407       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1408 
1409   /// Return the chain of instructions representing an inloop reduction.
1410   const ReductionChainMap &getInLoopReductionChains() const {
1411     return InLoopReductionChains;
1412   }
1413 
1414   /// Returns true if the Phi is part of an inloop reduction.
1415   bool isInLoopReduction(PHINode *Phi) const {
1416     return InLoopReductionChains.count(Phi);
1417   }
1418 
1419   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1420   /// with factor VF.  Return the cost of the instruction, including
1421   /// scalarization overhead if it's needed.
1422   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1423 
1424   /// Estimate cost of a call instruction CI if it were vectorized with factor
1425   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1429   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1430                              bool &NeedToScalarize);
1431 
1432   /// Invalidates decisions already taken by the cost model.
1433   void invalidateCostModelingDecisions() {
1434     WideningDecisions.clear();
1435     Uniforms.clear();
1436     Scalars.clear();
1437   }
1438 
1439 private:
1440   unsigned NumPredStores = 0;
1441 
1442   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1443   /// than zero. One is returned if vectorization should best be avoided due
1444   /// to cost.
1445   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1446                                     ElementCount UserVF);
1447 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1455   using VectorizationCostTy = std::pair<unsigned, bool>;
1456 
1457   /// Returns the expected execution cost. The unit of the cost does
1458   /// not matter because we use the 'cost' units to compare different
1459   /// vector widths. The cost that is returned is *not* normalized by
1460   /// the factor width.
1461   VectorizationCostTy expectedCost(ElementCount VF);
1462 
1463   /// Returns the execution time cost of an instruction for a given vector
1464   /// width. Vector width of one means scalar.
1465   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1466 
1467   /// The cost-computation logic from getInstructionCost which provides
1468   /// the vector type as an output parameter.
1469   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1470 
1471   /// Calculate vectorization cost of memory instruction \p I.
1472   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1473 
1474   /// The cost computation for scalarized memory instruction.
1475   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1476 
1477   /// The cost computation for interleaving group of memory instructions.
1478   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1479 
1480   /// The cost computation for Gather/Scatter instruction.
1481   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1482 
1483   /// The cost computation for widening instruction \p I with consecutive
1484   /// memory access.
1485   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1486 
1487   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1488   /// Load: scalar load + broadcast.
1489   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1490   /// element)
1491   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1492 
1493   /// Estimate the overhead of scalarizing an instruction. This is a
1494   /// convenience wrapper for the type-based getScalarizationOverhead API.
1495   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1496 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1499   bool isConsecutiveLoadOrStore(Instruction *I);
1500 
1501   /// Returns true if an artificially high cost for emulated masked memrefs
1502   /// should be used.
1503   bool useEmulatedMaskMemRefHack(Instruction *I);
1504 
1505   /// Map of scalar integer values to the smallest bitwidth they can be legally
1506   /// represented as. The vector equivalents of these values should be truncated
1507   /// to this type.
1508   MapVector<Instruction *, uint64_t> MinBWs;
1509 
1510   /// A type representing the costs for instructions if they were to be
1511   /// scalarized rather than vectorized. The entries are Instruction-Cost
1512   /// pairs.
1513   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1514 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1517   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1518 
1519   /// Records whether it is allowed to have the original scalar loop execute at
1520   /// least once. This may be needed as a fallback loop in case runtime
1521   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the
  /// VF, or as a peel-loop to handle gaps in interleave-groups.
1524   /// Under optsize and when the trip count is very small we don't allow any
1525   /// iterations to execute in the scalar loop.
1526   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1527 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1529   bool FoldTailByMasking = false;
1530 
1531   /// A map holding scalar costs for different vectorization factors. The
1532   /// presence of a cost for an instruction in the mapping indicates that the
1533   /// instruction will be scalarized when vectorizing with the associated
1534   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1535   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1536 
1537   /// Holds the instructions known to be uniform after vectorization.
1538   /// The data is collected per VF.
1539   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1540 
1541   /// Holds the instructions known to be scalar after vectorization.
1542   /// The data is collected per VF.
1543   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1544 
1545   /// Holds the instructions (address computations) that are forced to be
1546   /// scalarized.
1547   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1548 
1549   /// PHINodes of the reductions that should be expanded in-loop along with
1550   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1552   ReductionChainMap InLoopReductionChains;
1553 
1554   /// Returns the expected difference in cost from scalarizing the expression
1555   /// feeding a predicated instruction \p PredInst. The instructions to
1556   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1557   /// non-negative return value implies the expression will be scalarized.
1558   /// Currently, only single-use chains are considered for scalarization.
1559   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1560                               ElementCount VF);
1561 
1562   /// Collect the instructions that are uniform after vectorization. An
1563   /// instruction is uniform if we represent it with a single scalar value in
1564   /// the vectorized loop corresponding to each vector iteration. Examples of
1565   /// uniform instructions include pointer operands of consecutive or
1566   /// interleaved memory accesses. Note that although uniformity implies an
1567   /// instruction will be scalar, the reverse is not true. In general, a
1568   /// scalarized instruction will be represented by VF scalar values in the
1569   /// vectorized loop, each corresponding to an iteration of the original
1570   /// scalar loop.
1571   void collectLoopUniforms(ElementCount VF);
1572 
1573   /// Collect the instructions that are scalar after vectorization. An
1574   /// instruction is scalar if it is known to be uniform or will be scalarized
1575   /// during vectorization. Non-uniform scalarized instructions will be
1576   /// represented by VF values in the vectorized loop, each corresponding to an
1577   /// iteration of the original scalar loop.
1578   void collectLoopScalars(ElementCount VF);
1579 
1580   /// Keeps cost model vectorization decision and cost for instructions.
1581   /// Right now it is used for memory instructions only.
1582   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1583                                 std::pair<InstWidening, unsigned>>;
1584 
1585   DecisionList WideningDecisions;
1586 
1587   /// Returns true if \p V is expected to be vectorized and it needs to be
1588   /// extracted.
1589   bool needsExtract(Value *V, ElementCount VF) const {
1590     Instruction *I = dyn_cast<Instruction>(V);
1591     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1592         TheLoop->isLoopInvariant(I))
1593       return false;
1594 
1595     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1597     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1598     // the scalars are collected. That should be a safe assumption in most
1599     // cases, because we check if the operands have vectorizable types
1600     // beforehand in LoopVectorizationLegality.
1601     return Scalars.find(VF) == Scalars.end() ||
1602            !isScalarAfterVectorization(I, VF);
1603   };
1604 
1605   /// Returns a range containing only operands needing to be extracted.
1606   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1607                                                    ElementCount VF) {
1608     return SmallVector<Value *, 4>(make_filter_range(
1609         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1610   }
1611 
1612 public:
1613   /// The loop that we evaluate.
1614   Loop *TheLoop;
1615 
1616   /// Predicated scalar evolution analysis.
1617   PredicatedScalarEvolution &PSE;
1618 
1619   /// Loop Info analysis.
1620   LoopInfo *LI;
1621 
1622   /// Vectorization legality.
1623   LoopVectorizationLegality *Legal;
1624 
1625   /// Vector target information.
1626   const TargetTransformInfo &TTI;
1627 
1628   /// Target Library Info.
1629   const TargetLibraryInfo *TLI;
1630 
1631   /// Demanded bits analysis.
1632   DemandedBits *DB;
1633 
1634   /// Assumption cache.
1635   AssumptionCache *AC;
1636 
1637   /// Interface to emit optimization remarks.
1638   OptimizationRemarkEmitter *ORE;
1639 
1640   const Function *TheFunction;
1641 
1642   /// Loop Vectorize Hint.
1643   const LoopVectorizeHints *Hints;
1644 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1647   InterleavedAccessInfo &InterleaveInfo;
1648 
1649   /// Values to ignore in the cost model.
1650   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1651 
1652   /// Values to ignore in the cost model when VF > 1.
1653   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1654 };
1655 
1656 } // end namespace llvm
1657 
1658 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1659 // vectorization. The loop needs to be annotated with #pragma omp simd
1660 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1661 // vector length information is not provided, vectorization is not considered
1662 // explicit. Interleave hints are not allowed either. These limitations will be
1663 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1665 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1666 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1667 // provides *explicit vectorization hints* (LV can bypass legal checks and
1668 // assume that vectorization is legal). However, both hints are implemented
1669 // using the same metadata (llvm.loop.vectorize, processed by
1670 // LoopVectorizeHints). This will be fixed in the future when the native IR
1671 // representation for pragma 'omp simd' is introduced.
1672 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1673                                    OptimizationRemarkEmitter *ORE) {
1674   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1675   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1676 
1677   // Only outer loops with an explicit vectorization hint are supported.
1678   // Unannotated outer loops are ignored.
1679   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1680     return false;
1681 
1682   Function *Fn = OuterLp->getHeader()->getParent();
1683   if (!Hints.allowVectorization(Fn, OuterLp,
1684                                 true /*VectorizeOnlyWhenForced*/)) {
1685     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1686     return false;
1687   }
1688 
1689   if (Hints.getInterleave() > 1) {
1690     // TODO: Interleave support is future work.
1691     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1692                          "outer loops.\n");
1693     Hints.emitRemarkWithHints();
1694     return false;
1695   }
1696 
1697   return true;
1698 }
1699 
1700 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1701                                   OptimizationRemarkEmitter *ORE,
1702                                   SmallVectorImpl<Loop *> &V) {
1703   // Collect inner loops and outer loops without irreducible control flow. For
1704   // now, only collect outer loops that have explicit vectorization hints. If we
1705   // are stress testing the VPlan H-CFG construction, we collect the outermost
1706   // loop of every loop nest.
1707   if (L.isInnermost() || VPlanBuildStressTest ||
1708       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1709     LoopBlocksRPO RPOT(&L);
1710     RPOT.perform(LI);
1711     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1712       V.push_back(&L);
1713       // TODO: Collect inner loops inside marked outer loops in case
1714       // vectorization fails for the outer loop. Do not invoke
1715       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1716       // already known to be reducible. We can use an inherited attribute for
1717       // that.
1718       return;
1719     }
1720   }
1721   for (Loop *InnerL : L)
1722     collectSupportedLoops(*InnerL, LI, ORE, V);
1723 }
1724 
1725 namespace {
1726 
1727 /// The LoopVectorize Pass.
1728 struct LoopVectorize : public FunctionPass {
1729   /// Pass identification, replacement for typeid
1730   static char ID;
1731 
1732   LoopVectorizePass Impl;
1733 
1734   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1735                          bool VectorizeOnlyWhenForced = false)
1736       : FunctionPass(ID),
1737         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1738     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1739   }
1740 
1741   bool runOnFunction(Function &F) override {
1742     if (skipFunction(F))
1743       return false;
1744 
1745     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1746     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1747     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1748     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1749     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1750     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1751     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1752     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1753     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1754     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1755     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1756     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1757     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1758 
1759     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1760         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1761 
1762     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1763                         GetLAA, *ORE, PSI).MadeAnyChange;
1764   }
1765 
1766   void getAnalysisUsage(AnalysisUsage &AU) const override {
1767     AU.addRequired<AssumptionCacheTracker>();
1768     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1769     AU.addRequired<DominatorTreeWrapperPass>();
1770     AU.addRequired<LoopInfoWrapperPass>();
1771     AU.addRequired<ScalarEvolutionWrapperPass>();
1772     AU.addRequired<TargetTransformInfoWrapperPass>();
1773     AU.addRequired<AAResultsWrapperPass>();
1774     AU.addRequired<LoopAccessLegacyAnalysis>();
1775     AU.addRequired<DemandedBitsWrapperPass>();
1776     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1777     AU.addRequired<InjectTLIMappingsLegacy>();
1778 
1779     // We currently do not preserve loopinfo/dominator analyses with outer loop
1780     // vectorization. Until this is addressed, mark these analyses as preserved
1781     // only for non-VPlan-native path.
1782     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1783     if (!EnableVPlanNativePath) {
1784       AU.addPreserved<LoopInfoWrapperPass>();
1785       AU.addPreserved<DominatorTreeWrapperPass>();
1786     }
1787 
1788     AU.addPreserved<BasicAAWrapperPass>();
1789     AU.addPreserved<GlobalsAAWrapperPass>();
1790     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1791   }
1792 };
1793 
1794 } // end anonymous namespace
1795 
1796 //===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
1799 //===----------------------------------------------------------------------===//
1800 
1801 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1805   Instruction *Instr = dyn_cast<Instruction>(V);
1806   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1807                      (!Instr ||
1808                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1809   // Place the code for broadcasting invariant variables in the new preheader.
1810   IRBuilder<>::InsertPointGuard Guard(Builder);
1811   if (SafeToHoist)
1812     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1813 
1814   // Broadcast the scalar into all locations in the vector.
1815   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1816 
1817   return Shuf;
1818 }
1819 
1820 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1821     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1822   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1823          "Expected either an induction phi-node or a truncate of it!");
1824   Value *Start = II.getStartValue();
1825 
1826   // Construct the initial value of the vector IV in the vector loop preheader
1827   auto CurrIP = Builder.saveIP();
1828   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1829   if (isa<TruncInst>(EntryVal)) {
1830     assert(Start->getType()->isIntegerTy() &&
1831            "Truncation requires an integer type");
1832     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1833     Step = Builder.CreateTrunc(Step, TruncType);
1834     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1835   }
1836   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1837   Value *SteppedStart =
1838       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1839 
1840   // We create vector phi nodes for both integer and floating-point induction
1841   // variables. Here, we determine the kind of arithmetic we will perform.
1842   Instruction::BinaryOps AddOp;
1843   Instruction::BinaryOps MulOp;
1844   if (Step->getType()->isIntegerTy()) {
1845     AddOp = Instruction::Add;
1846     MulOp = Instruction::Mul;
1847   } else {
1848     AddOp = II.getInductionOpcode();
1849     MulOp = Instruction::FMul;
1850   }
1851 
1852   // Multiply the vectorization factor by the step using integer or
1853   // floating-point arithmetic as appropriate.
1854   Value *ConstVF =
1855       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1856   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1857 
1858   // Create a vector splat to use in the induction update.
1859   //
1860   // FIXME: If the step is non-constant, we create the vector splat with
1861   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1862   //        handle a constant vector splat.
1863   assert(!VF.isScalable() && "scalable vectors not yet supported.");
1864   Value *SplatVF = isa<Constant>(Mul)
1865                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1866                        : Builder.CreateVectorSplat(VF, Mul);
1867   Builder.restoreIP(CurrIP);
1868 
1869   // We may need to add the step a number of times, depending on the unroll
1870   // factor. The last of those goes into the PHI.
1871   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1872                                     &*LoopVectorBody->getFirstInsertionPt());
1873   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1874   Instruction *LastInduction = VecInd;
1875   for (unsigned Part = 0; Part < UF; ++Part) {
1876     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1877 
1878     if (isa<TruncInst>(EntryVal))
1879       addMetadata(LastInduction, EntryVal);
1880     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1881 
1882     LastInduction = cast<Instruction>(addFastMathFlag(
1883         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1884     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1885   }
1886 
1887   // Move the last step to the end of the latch block. This ensures consistent
1888   // placement of all induction updates.
1889   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1890   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1891   auto *ICmp = cast<Instruction>(Br->getCondition());
1892   LastInduction->moveBefore(ICmp);
1893   LastInduction->setName("vec.ind.next");
1894 
1895   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1896   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1897 }
1898 
1899 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1900   return Cost->isScalarAfterVectorization(I, VF) ||
1901          Cost->isProfitableToScalarize(I, VF);
1902 }
1903 
1904 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1905   if (shouldScalarizeInstruction(IV))
1906     return true;
1907   auto isScalarInst = [&](User *U) -> bool {
1908     auto *I = cast<Instruction>(U);
1909     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1910   };
1911   return llvm::any_of(IV->users(), isScalarInst);
1912 }
1913 
1914 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1915     const InductionDescriptor &ID, const Instruction *EntryVal,
1916     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1917   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1918          "Expected either an induction phi-node or a truncate of it!");
1919 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1926   if (isa<TruncInst>(EntryVal))
1927     return;
1928 
1929   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1930   if (Casts.empty())
1931     return;
1932   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1934   // induction update chain itself.
1935   Instruction *CastInst = *Casts.begin();
1936   if (Lane < UINT_MAX)
1937     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1938   else
1939     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1940 }
1941 
1942 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1943   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1944          "Primary induction variable must have an integer type");
1945 
1946   auto II = Legal->getInductionVars().find(IV);
1947   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1948 
1949   auto ID = II->second;
1950   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1951 
1952   // The value from the original loop to which we are mapping the new induction
1953   // variable.
1954   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1955 
1956   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1957 
1958   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1960   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1961     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1962            "Induction step should be loop invariant");
1963     if (PSE.getSE()->isSCEVable(IV->getType())) {
1964       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1965       return Exp.expandCodeFor(Step, Step->getType(),
1966                                LoopVectorPreHeader->getTerminator());
1967     }
1968     return cast<SCEVUnknown>(Step)->getValue();
1969   };
1970 
1971   // The scalar value to broadcast. This is derived from the canonical
1972   // induction variable. If a truncation type is given, truncate the canonical
1973   // induction variable and step. Otherwise, derive these values from the
1974   // induction descriptor.
1975   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1976     Value *ScalarIV = Induction;
1977     if (IV != OldInduction) {
1978       ScalarIV = IV->getType()->isIntegerTy()
1979                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1980                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1981                                           IV->getType());
1982       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1983       ScalarIV->setName("offset.idx");
1984     }
1985     if (Trunc) {
1986       auto *TruncType = cast<IntegerType>(Trunc->getType());
1987       assert(Step->getType()->isIntegerTy() &&
1988              "Truncation requires an integer step");
1989       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1990       Step = Builder.CreateTrunc(Step, TruncType);
1991     }
1992     return ScalarIV;
1993   };
1994 
  // Create the vector values from the scalar IV, for the case where we do not
  // create a vector IV.
1997   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1998     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1999     for (unsigned Part = 0; Part < UF; ++Part) {
2000       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2001       Value *EntryPart =
2002           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2003                         ID.getInductionOpcode());
2004       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2005       if (Trunc)
2006         addMetadata(EntryPart, Trunc);
2007       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2008     }
2009   };
2010 
2011   // Now do the actual transformations, and start with creating the step value.
2012   Value *Step = CreateStepValue(ID.getStep());
2013   if (VF.isZero() || VF.isScalar()) {
2014     Value *ScalarIV = CreateScalarIV(Step);
2015     CreateSplatIV(ScalarIV, Step);
2016     return;
2017   }
2018 
2019   // Determine if we want a scalar version of the induction variable. This is
2020   // true if the induction variable itself is not widened, or if it has at
2021   // least one user in the loop that is not widened.
2022   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2023   if (!NeedsScalarIV) {
2024     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2025     return;
2026   }
2027 
2028   // Try to create a new independent vector induction variable. If we can't
2029   // create the phi node, we will splat the scalar induction variable in each
2030   // loop iteration.
2031   if (!shouldScalarizeInstruction(EntryVal)) {
2032     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2033     Value *ScalarIV = CreateScalarIV(Step);
2034     // Create scalar steps that can be used by instructions we will later
2035     // scalarize. Note that the addition of the scalar steps will not increase
2036     // the number of instructions in the loop in the common case prior to
2037     // InstCombine. We will be trading one vector extract for each scalar step.
2038     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2039     return;
2040   }
2041 
2042   // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. Except when we tail-fold, then the splat IV feeds the
2044   // predicate used by the masked loads/stores.
2045   Value *ScalarIV = CreateScalarIV(Step);
2046   if (!Cost->isScalarEpilogueAllowed())
2047     CreateSplatIV(ScalarIV, Step);
2048   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2049 }
2050 
2051 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2052                                           Instruction::BinaryOps BinOp) {
2053   // Create and check the types.
2054   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2055   int VLen = ValVTy->getNumElements();
2056 
2057   Type *STy = Val->getType()->getScalarType();
2058   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2059          "Induction Step must be an integer or FP");
2060   assert(Step->getType() == STy && "Step has wrong type");
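  // The result is Val + <StartIdx, StartIdx + 1, ...> * Step. For example
  // (illustrative), with VF = 4, StartIdx = 0 and an i32 step %s, the emitted
  // sequence is roughly:
  //   %step.splat = splat %s
  //   %offsets    = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %step.splat
  //   %induction  = add <4 x i32> %val, %offsets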
2061 
2062   SmallVector<Constant *, 8> Indices;
2063 
2064   if (STy->isIntegerTy()) {
2065     // Create a vector of consecutive numbers from zero to VF.
2066     for (int i = 0; i < VLen; ++i)
2067       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2068 
2069     // Add the consecutive indices to the vector value.
2070     Constant *Cv = ConstantVector::get(Indices);
2071     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2072     Step = Builder.CreateVectorSplat(VLen, Step);
2073     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be taken from the original scalar operations.
2076     Step = Builder.CreateMul(Cv, Step);
2077     return Builder.CreateAdd(Val, Step, "induction");
2078   }
2079 
2080   // Floating point induction.
2081   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2082          "Binary Opcode should be specified for FP induction");
2083   // Create a vector of consecutive numbers from zero to VF.
2084   for (int i = 0; i < VLen; ++i)
2085     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2086 
2087   // Add the consecutive indices to the vector value.
2088   Constant *Cv = ConstantVector::get(Indices);
2089 
2090   Step = Builder.CreateVectorSplat(VLen, Step);
2091 
2092   // Floating point operations had to be 'fast' to enable the induction.
2093   FastMathFlags Flags;
2094   Flags.setFast();
2095 
2096   Value *MulOp = Builder.CreateFMul(Cv, Step);
2097   if (isa<Instruction>(MulOp))
2098     // Have to check, MulOp may be a constant
2099     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2100 
2101   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2102   if (isa<Instruction>(BOp))
2103     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2104   return BOp;
2105 }
2106 
2107 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2108                                            Instruction *EntryVal,
2109                                            const InductionDescriptor &ID) {
2110   // We shouldn't have to build scalar steps if we aren't vectorizing.
2111   assert(VF.isVector() && "VF should be greater than one");
2112   assert(!VF.isScalable() &&
2113          "the code below assumes a fixed number of elements at compile time");
  // Get the value type and ensure it and the step have the same type.
2115   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2116   assert(ScalarIVTy == Step->getType() &&
2117          "Val and Step should have the same type");
2118 
2119   // We build scalar steps for both integer and floating-point induction
2120   // variables. Here, we determine the kind of arithmetic we will perform.
2121   Instruction::BinaryOps AddOp;
2122   Instruction::BinaryOps MulOp;
2123   if (ScalarIVTy->isIntegerTy()) {
2124     AddOp = Instruction::Add;
2125     MulOp = Instruction::Mul;
2126   } else {
2127     AddOp = ID.getInductionOpcode();
2128     MulOp = Instruction::FMul;
2129   }
2130 
2131   // Determine the number of scalars we need to generate for each unroll
2132   // iteration. If EntryVal is uniform, we only need to generate the first
2133   // lane. Otherwise, we generate all VF values.
2134   unsigned Lanes =
2135       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2136           ? 1
2137           : VF.getKnownMinValue();
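  // For each unroll part and lane the generated scalar step is (illustrative):
  //   ScalarIV + (VF * Part + Lane) * Step
  // e.g. with VF = 4, lane 1 of unroll part 1 receives ScalarIV + 5 * Step.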
2138   // Compute the scalar steps and save the results in VectorLoopValueMap.
2139   for (unsigned Part = 0; Part < UF; ++Part) {
2140     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2141       auto *StartIdx = getSignedIntOrFpConstant(
2142           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2143       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2144       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2145       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2146       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2147     }
2148   }
2149 }
2150 
2151 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2152   assert(V != Induction && "The new induction variable should not be used.");
2153   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2154   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2155 
2156   // If we have a stride that is replaced by one, do it here. Defer this for
2157   // the VPlan-native path until we start running Legal checks in that path.
2158   if (!EnableVPlanNativePath && Legal->hasStride(V))
2159     V = ConstantInt::get(V->getType(), 1);
2160 
2161   // If we have a vector mapped to this value, return it.
2162   if (VectorLoopValueMap.hasVectorValue(V, Part))
2163     return VectorLoopValueMap.getVectorValue(V, Part);
2164 
2165   // If the value has not been vectorized, check if it has been scalarized
2166   // instead. If it has been scalarized, and we actually need the value in
2167   // vector form, we will construct the vector values on demand.
2168   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2169     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2170 
2171     // If we've scalarized a value, that value should be an instruction.
2172     auto *I = cast<Instruction>(V);
2173 
2174     // If we aren't vectorizing, we can just copy the scalar map values over to
2175     // the vector map.
2176     if (VF.isScalar()) {
2177       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2178       return ScalarValue;
2179     }
2180 
2181     // Get the last scalar instruction we generated for V and Part. If the value
2182     // is known to be uniform after vectorization, this corresponds to lane zero
2183     // of the Part unroll iteration. Otherwise, the last instruction is the one
2184     // we created for the last vector lane of the Part unroll iteration.
2185     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2186     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2187                             ? 0
2188                             : VF.getKnownMinValue() - 1;
2189     auto *LastInst = cast<Instruction>(
2190         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2191 
2192     // Set the insert point after the last scalarized instruction. This ensures
2193     // the insertelement sequence will directly follow the scalar definitions.
2194     auto OldIP = Builder.saveIP();
2195     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2196     Builder.SetInsertPoint(&*NewIP);
2197 
2198     // However, if we are vectorizing, we need to construct the vector values.
2199     // If the value is known to be uniform after vectorization, we can just
2200     // broadcast the scalar value corresponding to lane zero for each unroll
2201     // iteration. Otherwise, we construct the vector values using insertelement
2202     // instructions. Since the resulting vectors are stored in
2203     // VectorLoopValueMap, we will only generate the insertelements once.
2204     Value *VectorValue = nullptr;
2205     if (Cost->isUniformAfterVectorization(I, VF)) {
2206       VectorValue = getBroadcastInstrs(ScalarValue);
2207       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2208     } else {
2209       // Initialize packing with insertelements to start from undef.
2210       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2211       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2212       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2213       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2214         packScalarIntoVectorValue(V, {Part, Lane});
2215       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2216     }
2217     Builder.restoreIP(OldIP);
2218     return VectorValue;
2219   }
2220 
2221   // If this scalar is unknown, assume that it is a constant or that it is
2222   // loop invariant. Broadcast V and save the value for future uses.
2223   Value *B = getBroadcastInstrs(V);
2224   VectorLoopValueMap.setVectorValue(V, Part, B);
2225   return B;
2226 }
2227 
2228 Value *
2229 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2230                                             const VPIteration &Instance) {
2231   // If the value is not an instruction contained in the loop, it should
2232   // already be scalar.
2233   if (OrigLoop->isLoopInvariant(V))
2234     return V;
2235 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2239 
2240   // If the value from the original loop has not been vectorized, it is
2241   // represented by UF x VF scalar values in the new loop. Return the requested
2242   // scalar value.
2243   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2244     return VectorLoopValueMap.getScalarValue(V, Instance);
2245 
2246   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2247   // for the given unroll part. If this entry is not a vector type (i.e., the
2248   // vectorization factor is one), there is no need to generate an
2249   // extractelement instruction.
2250   auto *U = getOrCreateVectorValue(V, Instance.Part);
2251   if (!U->getType()->isVectorTy()) {
2252     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2253     return U;
2254   }
2255 
2256   // Otherwise, the value from the original loop has been vectorized and is
2257   // represented by UF vector values. Extract and return the requested scalar
2258   // value from the appropriate vector lane.
2259   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2260 }
2261 
2262 void InnerLoopVectorizer::packScalarIntoVectorValue(
2263     Value *V, const VPIteration &Instance) {
2264   assert(V != Induction && "The new induction variable should not be used.");
2265   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2266   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2267 
2268   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2269   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2270   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2271                                             Builder.getInt32(Instance.Lane));
2272   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2273 }
2274 
2275 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2276   assert(Vec->getType()->isVectorTy() && "Invalid type");
2277   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2278   SmallVector<int, 8> ShuffleMask;
2279   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2280     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
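  // E.g., for VF = 4 the reverse mask is <3, 2, 1, 0>.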
2281 
2282   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2283 }
2284 
2285 // Return whether we allow using masked interleave-groups (for dealing with
2286 // strided loads/stores that reside in predicated blocks, or for dealing
2287 // with gaps).
2288 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2289   // If an override option has been passed in for interleaved accesses, use it.
2290   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2291     return EnableMaskedInterleavedMemAccesses;
2292 
2293   return TTI.enableMaskedInterleavedAccessVectorization();
2294 }
2295 
2296 // Try to vectorize the interleave group that \p Instr belongs to.
2297 //
// E.g. Translate the following interleaved load group (factor = 3):
2299 //   for (i = 0; i < N; i+=3) {
2300 //     R = Pic[i];             // Member of index 0
2301 //     G = Pic[i+1];           // Member of index 1
2302 //     B = Pic[i+2];           // Member of index 2
2303 //     ... // do something to R, G, B
2304 //   }
2305 // To:
2306 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2307 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2308 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2309 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2310 //
// Or translate the following interleaved store group (factor = 3):
2312 //   for (i = 0; i < N; i+=3) {
2313 //     ... do something to R, G, B
2314 //     Pic[i]   = R;           // Member of index 0
2315 //     Pic[i+1] = G;           // Member of index 1
2316 //     Pic[i+2] = B;           // Member of index 2
2317 //   }
2318 // To:
2319 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2320 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2321 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2322 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2323 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2324 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2325     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2326     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2327   Instruction *Instr = Group->getInsertPos();
2328   const DataLayout &DL = Instr->getModule()->getDataLayout();
2329 
2330   // Prepare for the vector type of the interleaved load/store.
2331   Type *ScalarTy = getMemInstValueType(Instr);
2332   unsigned InterleaveFactor = Group->getFactor();
2333   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2334   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2335 
2336   // Prepare for the new pointers.
2337   SmallVector<Value *, 2> AddrParts;
2338   unsigned Index = Group->getIndex(Instr);
2339 
2340   // TODO: extend the masked interleaved-group support to reversed access.
2341   assert((!BlockInMask || !Group->isReverse()) &&
2342          "Reversed masked interleave-group not supported.");
2343 
2344   // If the group is reverse, adjust the index to refer to the last vector lane
2345   // instead of the first. We adjust the index from the first vector lane,
2346   // rather than directly getting the pointer for lane VF - 1, because the
2347   // pointer operand of the interleaved access is supposed to be uniform. For
2348   // uniform instructions, we're only required to generate a value for the
2349   // first vector lane in each unroll iteration.
2350   assert(!VF.isScalable() &&
2351          "scalable vector reverse operation is not implemented");
2352   if (Group->isReverse())
2353     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2354 
2355   for (unsigned Part = 0; Part < UF; Part++) {
2356     Value *AddrPart = State.get(Addr, {Part, 0});
2357     setDebugLocFromInst(Builder, AddrPart);
2358 
    // Notice that the current instruction could be at any member index of the
    // group. We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2370 
2371     bool InBounds = false;
2372     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2373       InBounds = gep->isInBounds();
2374     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2375     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2376 
2377     // Cast to the vector pointer type.
2378     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2379     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2380     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2381   }
2382 
2383   setDebugLocFromInst(Builder, Instr);
2384   Value *UndefVec = UndefValue::get(VecTy);
2385 
2386   Value *MaskForGaps = nullptr;
2387   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2388     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2389     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2390     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2391   }
2392 
2393   // Vectorize the interleaved load group.
2394   if (isa<LoadInst>(Instr)) {
2395     // For each unroll part, create a wide load for the group.
2396     SmallVector<Value *, 2> NewLoads;
2397     for (unsigned Part = 0; Part < UF; Part++) {
2398       Instruction *NewLoad;
2399       if (BlockInMask || MaskForGaps) {
2400         assert(useMaskedInterleavedAccesses(*TTI) &&
2401                "masked interleaved groups are not allowed.");
2402         Value *GroupMask = MaskForGaps;
2403         if (BlockInMask) {
2404           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2405           assert(!VF.isScalable() && "scalable vectors not yet supported.");
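          // For illustration (assuming VF = 4 and factor = 3): a block mask
          // <m0, m1, m2, m3> is replicated to
          // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>, so each member
          // of an interleave tuple inherits its iteration's predicate.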
2406           Value *ShuffledMask = Builder.CreateShuffleVector(
2407               BlockInMaskPart,
2408               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2409               "interleaved.mask");
2410           GroupMask = MaskForGaps
2411                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2412                                                 MaskForGaps)
2413                           : ShuffledMask;
2414         }
2415         NewLoad =
2416             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2417                                      GroupMask, UndefVec, "wide.masked.vec");
2418       }
2419       else
2420         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2421                                             Group->getAlign(), "wide.vec");
2422       Group->addMetadata(NewLoad);
2423       NewLoads.push_back(NewLoad);
2424     }
2425 
2426     // For each member in the group, shuffle out the appropriate data from the
2427     // wide loads.
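    // For illustration (assuming VF = 4 and factor = 3): member 0 is extracted
    // with the strided mask <0, 3, 6, 9>, member 1 with <1, 4, 7, 10>, and
    // member 2 with <2, 5, 8, 11>.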
2428     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2429       Instruction *Member = Group->getMember(I);
2430 
2431       // Skip the gaps in the group.
2432       if (!Member)
2433         continue;
2434 
2435       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2436       auto StrideMask =
2437           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2438       for (unsigned Part = 0; Part < UF; Part++) {
2439         Value *StridedVec = Builder.CreateShuffleVector(
2440             NewLoads[Part], StrideMask, "strided.vec");
2441 
        // If this member has a different type, cast the result to that type.
2443         if (Member->getType() != ScalarTy) {
2444           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2445           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2446           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2447         }
2448 
2449         if (Group->isReverse())
2450           StridedVec = reverseVector(StridedVec);
2451 
2452         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2453       }
2454     }
2455     return;
2456   }
2457 
  // The sub vector type for the current instruction.
2459   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2460   auto *SubVT = VectorType::get(ScalarTy, VF);
2461 
2462   // Vectorize the interleaved store group.
2463   for (unsigned Part = 0; Part < UF; Part++) {
2464     // Collect the stored vector from each member.
2465     SmallVector<Value *, 4> StoredVecs;
2466     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store groups don't allow gaps, so each index has a member.
      assert(Group->getMember(i) &&
             "Failed to get a member from an interleaved store group");
2469 
2470       Value *StoredVec = State.get(StoredValues[i], Part);
2471 
2472       if (Group->isReverse())
2473         StoredVec = reverseVector(StoredVec);
2474 
      // If this member has a different type, cast it to a unified type.
2477       if (StoredVec->getType() != SubVT)
2478         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2479 
2480       StoredVecs.push_back(StoredVec);
2481     }
2482 
2483     // Concatenate all vectors into a wide vector.
2484     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2485 
2486     // Interleave the elements in the wide vector.
2487     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2488     Value *IVec = Builder.CreateShuffleVector(
2489         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2490         "interleaved.vec");
2491 
2492     Instruction *NewStoreInstr;
2493     if (BlockInMask) {
2494       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2495       Value *ShuffledMask = Builder.CreateShuffleVector(
2496           BlockInMaskPart,
2497           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2498           "interleaved.mask");
2499       NewStoreInstr = Builder.CreateMaskedStore(
2500           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2501     }
2502     else
2503       NewStoreInstr =
2504           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2505 
2506     Group->addMetadata(NewStoreInstr);
2507   }
2508 }
2509 
2510 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2511     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2512     VPValue *StoredValue, VPValue *BlockInMask) {
2513   // Attempt to issue a wide load.
2514   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2515   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2516 
2517   assert((LI || SI) && "Invalid Load/Store instruction");
2518   assert((!SI || StoredValue) && "No stored value provided for widened store");
2519   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2520 
2521   LoopVectorizationCostModel::InstWidening Decision =
2522       Cost->getWideningDecision(Instr, VF);
2523   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2524           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2525           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2526          "CM decision is not to widen the memory instruction");
2527 
2528   Type *ScalarDataTy = getMemInstValueType(Instr);
2529 
2530   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2531   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2532   const Align Alignment = getLoadStoreAlignment(Instr);
2533 
2534   // Determine if the pointer operand of the access is either consecutive or
2535   // reverse consecutive.
2536   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2537   bool ConsecutiveStride =
2538       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2539   bool CreateGatherScatter =
2540       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2541 
2542   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2543   // gather/scatter. Otherwise Decision should have been to Scalarize.
2544   assert((ConsecutiveStride || CreateGatherScatter) &&
2545          "The instruction should be scalarized");
2546   (void)ConsecutiveStride;
2547 
2548   VectorParts BlockInMaskParts(UF);
2549   bool isMaskRequired = BlockInMask;
2550   if (isMaskRequired)
2551     for (unsigned Part = 0; Part < UF; ++Part)
2552       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2553 
2554   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2555     // Calculate the pointer for the specific unroll-part.
2556     GetElementPtrInst *PartPtr = nullptr;
2557 
2558     bool InBounds = false;
2559     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2560       InBounds = gep->isInBounds();
2561 
2562     if (Reverse) {
2563       // If the address is consecutive but reversed, then the
2564       // wide store needs to start at the last vector element.
2565       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2566           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2567       PartPtr->setIsInBounds(InBounds);
2568       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2569           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2570       PartPtr->setIsInBounds(InBounds);
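      // For illustration (assuming VF = 4): part 0 accesses the lanes at
      // Ptr[-3 .. 0] and part 1 the lanes at Ptr[-7 .. -4]; the vector value
      // itself is reversed separately via reverseVector.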
2571       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2572         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2573     } else {
2574       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2575           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2576       PartPtr->setIsInBounds(InBounds);
2577     }
2578 
2579     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2580     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2581   };
2582 
2583   // Handle Stores:
2584   if (SI) {
2585     setDebugLocFromInst(Builder, SI);
2586 
2587     for (unsigned Part = 0; Part < UF; ++Part) {
2588       Instruction *NewSI = nullptr;
2589       Value *StoredVal = State.get(StoredValue, Part);
2590       if (CreateGatherScatter) {
2591         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2592         Value *VectorGep = State.get(Addr, Part);
2593         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2594                                             MaskPart);
2595       } else {
2596         if (Reverse) {
2597           // If we store to reverse consecutive memory locations, then we need
2598           // to reverse the order of elements in the stored value.
2599           StoredVal = reverseVector(StoredVal);
2600           // We don't want to update the value in the map as it might be used in
2601           // another expression. So don't call resetVectorValue(StoredVal).
2602         }
2603         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2604         if (isMaskRequired)
2605           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2606                                             BlockInMaskParts[Part]);
2607         else
2608           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2609       }
2610       addMetadata(NewSI, SI);
2611     }
2612     return;
2613   }
2614 
2615   // Handle loads.
2616   assert(LI && "Must have a load instruction");
2617   setDebugLocFromInst(Builder, LI);
2618   for (unsigned Part = 0; Part < UF; ++Part) {
2619     Value *NewLI;
2620     if (CreateGatherScatter) {
2621       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2622       Value *VectorGep = State.get(Addr, Part);
2623       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2624                                          nullptr, "wide.masked.gather");
2625       addMetadata(NewLI, LI);
2626     } else {
2627       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2628       if (isMaskRequired)
2629         NewLI = Builder.CreateMaskedLoad(
2630             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2631             "wide.masked.load");
2632       else
2633         NewLI =
2634             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2635 
2636       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2637       addMetadata(NewLI, LI);
2638       if (Reverse)
2639         NewLI = reverseVector(NewLI);
2640     }
2641 
2642     State.set(Def, Instr, NewLI, Part);
2643   }
2644 }
2645 
2646 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2647                                                const VPIteration &Instance,
2648                                                bool IfPredicateInstr,
2649                                                VPTransformState &State) {
2650   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2651 
2652   setDebugLocFromInst(Builder, Instr);
2653 
  // Does this instruction return a value?
2655   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2656 
2657   Instruction *Cloned = Instr->clone();
2658   if (!IsVoidRetTy)
2659     Cloned->setName(Instr->getName() + ".cloned");
2660 
2661   // Replace the operands of the cloned instructions with their scalar
2662   // equivalents in the new loop.
2663   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2664     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2665     auto InputInstance = Instance;
2666     if (!Operand || !OrigLoop->contains(Operand) ||
2667         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2668       InputInstance.Lane = 0;
2669     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2670     Cloned->setOperand(op, NewOp);
2671   }
2672   addNewMetadata(Cloned, Instr);
2673 
2674   // Place the cloned scalar in the new loop.
2675   Builder.Insert(Cloned);
2676 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2678   // representing scalar values in VPTransformState. Add the cloned scalar to
2679   // the scalar map entry.
2680   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2681 
  // If we just cloned a new assumption, add it to the assumption cache.
2683   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2684     if (II->getIntrinsicID() == Intrinsic::assume)
2685       AC->registerAssumption(II);
2686 
2687   // End if-block.
2688   if (IfPredicateInstr)
2689     PredicatedInstructions.push_back(Cloned);
2690 }
2691 
2692 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2693                                                       Value *End, Value *Step,
2694                                                       Instruction *DL) {
2695   BasicBlock *Header = L->getHeader();
2696   BasicBlock *Latch = L->getLoopLatch();
2697   // As we're just creating this loop, it's possible no latch exists
2698   // yet. If so, use the header as this will be a single block loop.
2699   if (!Latch)
2700     Latch = Header;
2701 
2702   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2703   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2704   setDebugLocFromInst(Builder, OldInst);
2705   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2706 
2707   Builder.SetInsertPoint(Latch->getTerminator());
2708   setDebugLocFromInst(Builder, OldInst);
2709 
2710   // Create i+1 and fill the PHINode.
2711   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2712   Induction->addIncoming(Start, L->getLoopPreheader());
2713   Induction->addIncoming(Next, Latch);
2714   // Create the compare.
2715   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2716   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2717 
2718   // Now we have two terminators. Remove the old one from the block.
2719   Latch->getTerminator()->eraseFromParent();
2720 
2721   return Induction;
2722 }
2723 
2724 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2725   if (TripCount)
2726     return TripCount;
2727 
2728   assert(L && "Create Trip Count for null loop.");
2729   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2730   // Find the loop boundaries.
2731   ScalarEvolution *SE = PSE.getSE();
2732   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2733   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2734          "Invalid loop count");
2735 
2736   Type *IdxTy = Legal->getWidestInductionType();
2737   assert(IdxTy && "No type for induction");
2738 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable was signed and as such will not overflow, so
  // truncation is legal.
2744   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2745       IdxTy->getPrimitiveSizeInBits())
2746     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2747   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2748 
2749   // Get the total trip count from the count by adding 1.
2750   const SCEV *ExitCount = SE->getAddExpr(
2751       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2752 
2753   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2754 
2755   // Expand the trip count and place the new instructions in the preheader.
2756   // Notice that the pre-header does not change, only the loop body.
2757   SCEVExpander Exp(*SE, DL, "induction");
2758 
2759   // Count holds the overall loop count (N).
2760   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2761                                 L->getLoopPreheader()->getTerminator());
2762 
2763   if (TripCount->getType()->isPointerTy())
2764     TripCount =
2765         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2766                                     L->getLoopPreheader()->getTerminator());
2767 
2768   return TripCount;
2769 }
2770 
2771 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2772   if (VectorTripCount)
2773     return VectorTripCount;
2774 
2775   Value *TC = getOrCreateTripCount(L);
2776   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2777 
2778   Type *Ty = TC->getType();
2779   // This is where we can make the step a runtime constant.
2780   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2781   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2782 
2783   // If the tail is to be folded by masking, round the number of iterations N
2784   // up to a multiple of Step instead of rounding down. This is done by first
2785   // adding Step-1 and then rounding down. Note that it's ok if this addition
2786   // overflows: the vector induction variable will eventually wrap to zero given
2787   // that it starts at zero and its Step is a power of two; the loop will then
2788   // exit, with the last early-exit vector comparison also producing all-true.
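  //
  // For illustration (assuming VF * UF = 8): a trip count of 13 is rounded up
  // to 13 + 7 = 20; the URem/Sub below then produce a vector trip count of 16,
  // so the masked vector loop runs ceil(13 / 8) = 2 iterations.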
2789   if (Cost->foldTailByMasking()) {
2790     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2791            "VF*UF must be a power of 2 when folding tail by masking");
2792     TC = Builder.CreateAdd(
2793         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2794   }
2795 
2796   // Now we need to generate the expression for the part of the loop that the
2797   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2798   // iterations are not required for correctness, or N - Step, otherwise. Step
2799   // is equal to the vectorization factor (number of SIMD elements) times the
2800   // unroll factor (number of SIMD instructions).
2801   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2802 
2803   // If there is a non-reversed interleaved group that may speculatively access
2804   // memory out-of-bounds, we need to ensure that there will be at least one
2805   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2806   // the trip count, we set the remainder to be equal to the step. If the step
2807   // does not evenly divide the trip count, no adjustment is necessary since
2808   // there will already be scalar iterations. Note that the minimum iterations
2809   // check ensures that N >= Step.
2810   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2811     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2812     R = Builder.CreateSelect(IsZero, Step, R);
2813   }
2814 
2815   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2816 
2817   return VectorTripCount;
2818 }
2819 
2820 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2821                                                    const DataLayout &DL) {
2822   // Verify that V is a vector type with same number of elements as DstVTy.
2823   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2824   unsigned VF = DstFVTy->getNumElements();
2825   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2827   Type *SrcElemTy = SrcVecTy->getElementType();
2828   Type *DstElemTy = DstFVTy->getElementType();
2829   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2830          "Vector elements must have same size");
2831 
2832   // Do a direct cast if element types are castable.
2833   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2834     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2835   }
2836   // V cannot be directly casted to desired vector type.
2837   // May happen when V is a floating point vector but DstVTy is a vector of
2838   // pointers or vice-versa. Handle this using a two-step bitcast using an
2839   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
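  // For illustration (assuming 64-bit pointers): casting <4 x double> to
  // <4 x i8*> is done as <4 x double> -> <4 x i64> -> <4 x i8*>.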
2840   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2841          "Only one type should be a pointer type");
2842   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2843          "Only one type should be a floating point type");
2844   Type *IntTy =
2845       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2846   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2847   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2848   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2849 }
2850 
2851 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2852                                                          BasicBlock *Bypass) {
2853   Value *Count = getOrCreateTripCount(L);
2854   // Reuse existing vector loop preheader for TC checks.
2855   // Note that new preheader block is generated for vector loop.
2856   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2857   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2858 
2859   // Generate code to check if the loop's trip count is less than VF * UF, or
2860   // equal to it in case a scalar epilogue is required; this implies that the
2861   // vector trip count is zero. This check also covers the case where adding one
2862   // to the backedge-taken count overflowed leading to an incorrect trip count
2863   // of zero. In this case we will also jump to the scalar loop.
2864   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2865                                           : ICmpInst::ICMP_ULT;
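  // For illustration (assuming VF * UF = 8): with a trip count of 8, the
  // vector loop is entered when no scalar epilogue is required (8 ult 8 is
  // false), but the scalar loop must be taken when one is required (8 ule 8 is
  // true), since the vector trip count would otherwise be zero.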
2866 
2867   // If tail is to be folded, vector loop takes care of all iterations.
2868   Value *CheckMinIters = Builder.getFalse();
2869   if (!Cost->foldTailByMasking()) {
2870     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2871     CheckMinIters = Builder.CreateICmp(
2872         P, Count,
2873         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2874         "min.iters.check");
2875   }
2876   // Create new preheader for vector loop.
2877   LoopVectorPreHeader =
2878       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2879                  "vector.ph");
2880 
2881   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2882                                DT->getNode(Bypass)->getIDom()) &&
2883          "TC check is expected to dominate Bypass");
2884 
2885   // Update dominator for Bypass & LoopExit.
2886   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2887   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2888 
2889   ReplaceInstWithInst(
2890       TCCheckBlock->getTerminator(),
2891       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2892   LoopBypassBlocks.push_back(TCCheckBlock);
2893 }
2894 
2895 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2896   // Reuse existing vector loop preheader for SCEV checks.
2897   // Note that new preheader block is generated for vector loop.
2898   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2899 
  // Generate the code to check the SCEV assumptions that we made.
2901   // We want the new basic block to start at the first instruction in a
2902   // sequence of instructions that form a check.
2903   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2904                    "scev.check");
2905   Value *SCEVCheck = Exp.expandCodeForPredicate(
2906       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2907 
2908   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2909     if (C->isZero())
2910       return;
2911 
2912   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2913            (OptForSizeBasedOnProfile &&
2914             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2915          "Cannot SCEV check stride or overflow when optimizing for size");
2916 
2917   SCEVCheckBlock->setName("vector.scevcheck");
2918   // Create new preheader for vector loop.
2919   LoopVectorPreHeader =
2920       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2921                  nullptr, "vector.ph");
2922 
  // Update dominator only if this is the first RT check.
2924   if (LoopBypassBlocks.empty()) {
2925     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2926     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2927   }
2928 
2929   ReplaceInstWithInst(
2930       SCEVCheckBlock->getTerminator(),
2931       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2932   LoopBypassBlocks.push_back(SCEVCheckBlock);
2933   AddedSafetyChecks = true;
2934 }
2935 
2936 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2937   // VPlan-native path does not do any analysis for runtime checks currently.
2938   if (EnableVPlanNativePath)
2939     return;
2940 
2941   // Reuse existing vector loop preheader for runtime memory checks.
2942   // Note that new preheader block is generated for vector loop.
2943   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2944 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
2948   auto *LAI = Legal->getLAI();
2949   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2950   if (!RtPtrChecking.Need)
2951     return;
2952 
2953   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2954     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2955            "Cannot emit memory checks when optimizing for size, unless forced "
2956            "to vectorize.");
2957     ORE->emit([&]() {
2958       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2959                                         L->getStartLoc(), L->getHeader())
2960              << "Code-size may be reduced by not forcing "
2961                 "vectorization, or by source-code modifications "
2962                 "eliminating the need for runtime checks "
2963                 "(e.g., adding 'restrict').";
2964     });
2965   }
2966 
2967   MemCheckBlock->setName("vector.memcheck");
2968   // Create new preheader for vector loop.
2969   LoopVectorPreHeader =
2970       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2971                  "vector.ph");
2972 
  // Update dominator only if this is the first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
2984 
2985   Instruction *FirstCheckInst;
2986   Instruction *MemRuntimeCheck;
2987   std::tie(FirstCheckInst, MemRuntimeCheck) =
2988       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2989                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2990   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2991                             "claimed checks are required");
2992   CondBranch->setCondition(MemRuntimeCheck);
2993 
2994   // We currently don't use LoopVersioning for the actual loop cloning but we
2995   // still use it to add the noalias metadata.
2996   LVer = std::make_unique<LoopVersioning>(
2997       *Legal->getLAI(),
2998       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
2999       DT, PSE.getSE());
3000   LVer->prepareNoAliasMetadata();
3001 }
3002 
3003 Value *InnerLoopVectorizer::emitTransformedIndex(
3004     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3005     const InductionDescriptor &ID) const {
3006 
3007   SCEVExpander Exp(*SE, DL, "induction");
3008   auto Step = ID.getStep();
3009   auto StartValue = ID.getStartValue();
3010   assert(Index->getType() == Step->getType() &&
3011          "Index type does not match StepValue type");
3012 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3019   auto CreateAdd = [&B](Value *X, Value *Y) {
3020     assert(X->getType() == Y->getType() && "Types don't match!");
3021     if (auto *CX = dyn_cast<ConstantInt>(X))
3022       if (CX->isZero())
3023         return Y;
3024     if (auto *CY = dyn_cast<ConstantInt>(Y))
3025       if (CY->isZero())
3026         return X;
3027     return B.CreateAdd(X, Y);
3028   };
3029 
3030   auto CreateMul = [&B](Value *X, Value *Y) {
3031     assert(X->getType() == Y->getType() && "Types don't match!");
3032     if (auto *CX = dyn_cast<ConstantInt>(X))
3033       if (CX->isOne())
3034         return Y;
3035     if (auto *CY = dyn_cast<ConstantInt>(Y))
3036       if (CY->isOne())
3037         return X;
3038     return B.CreateMul(X, Y);
3039   };
3040 
3041   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3042   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3043   // the DomTree is not kept up-to-date for additional blocks generated in the
3044   // vector loop. By using the header as insertion point, we guarantee that the
3045   // expanded instructions dominate all their uses.
3046   auto GetInsertPoint = [this, &B]() {
3047     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3048     if (InsertBB != LoopVectorBody &&
3049         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3050       return LoopVectorBody->getTerminator();
3051     return &*B.GetInsertPoint();
3052   };
3053   switch (ID.getKind()) {
3054   case InductionDescriptor::IK_IntInduction: {
3055     assert(Index->getType() == StartValue->getType() &&
3056            "Index type does not match StartValue type");
3057     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3058       return B.CreateSub(StartValue, Index);
3059     auto *Offset = CreateMul(
3060         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3061     return CreateAdd(StartValue, Offset);
3062   }
3063   case InductionDescriptor::IK_PtrInduction: {
3064     assert(isa<SCEVConstant>(Step) &&
3065            "Expected constant step for pointer induction");
3066     return B.CreateGEP(
3067         StartValue->getType()->getPointerElementType(), StartValue,
3068         CreateMul(Index,
3069                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3070   }
3071   case InductionDescriptor::IK_FpInduction: {
3072     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3073     auto InductionBinOp = ID.getInductionBinOp();
3074     assert(InductionBinOp &&
3075            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3076             InductionBinOp->getOpcode() == Instruction::FSub) &&
3077            "Original bin op should be defined for FP induction");
3078 
3079     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3080 
3081     // Floating point operations had to be 'fast' to enable the induction.
3082     FastMathFlags Flags;
3083     Flags.setFast();
3084 
3085     Value *MulExp = B.CreateFMul(StepValue, Index);
3086     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3088       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3089 
3090     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3091                                "induction");
3092     if (isa<Instruction>(BOp))
3093       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3094 
3095     return BOp;
3096   }
3097   case InductionDescriptor::IK_NoInduction:
3098     return nullptr;
3099   }
3100   llvm_unreachable("invalid enum");
3101 }
3102 
3103 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3104   LoopScalarBody = OrigLoop->getHeader();
3105   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3106   LoopExitBlock = OrigLoop->getExitBlock();
3107   assert(LoopExitBlock && "Must have an exit block");
3108   assert(LoopVectorPreHeader && "Invalid loop structure");
3109 
3110   LoopMiddleBlock =
3111       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3112                  LI, nullptr, Twine(Prefix) + "middle.block");
3113   LoopScalarPreHeader =
3114       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3115                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3119   LoopVectorBody =
3120       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3121                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3122 
3123   // Update dominator for loop exit.
3124   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3125 
3126   // Create and register the new vector loop.
3127   Loop *Lp = LI->AllocateLoop();
3128   Loop *ParentLoop = OrigLoop->getParentLoop();
3129 
3130   // Insert the new loop into the loop nest and register the new basic blocks
3131   // before calling any utilities such as SCEV that require valid LoopInfo.
3132   if (ParentLoop) {
3133     ParentLoop->addChildLoop(Lp);
3134   } else {
3135     LI->addTopLevelLoop(Lp);
3136   }
3137   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3138   return Lp;
3139 }
3140 
3141 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3142                                                       Value *VectorTripCount) {
3143   assert(VectorTripCount && L && "Expected valid arguments");
3144   // We are going to resume the execution of the scalar loop.
3145   // Go over all of the induction variables that we found and fix the
3146   // PHIs that are left in the scalar version of the loop.
3147   // The starting values of PHI nodes depend on the counter of the last
3148   // iteration in the vectorized loop.
3149   // If we come from a bypass edge then we need to start from the original
3150   // start value.
3151   for (auto &InductionEntry : Legal->getInductionVars()) {
3152     PHINode *OrigPhi = InductionEntry.first;
3153     InductionDescriptor II = InductionEntry.second;
3154 
    // Create phi nodes to merge from the backedge-taken check block.
3156     PHINode *BCResumeVal =
3157         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3158                         LoopScalarPreHeader->getTerminator());
3159     // Copy original phi DL over to the new one.
3160     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3161     Value *&EndValue = IVEndValues[OrigPhi];
3162     if (OrigPhi == OldInduction) {
3163       // We know what the end value is.
3164       EndValue = VectorTripCount;
3165     } else {
3166       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3167       Type *StepType = II.getStep()->getType();
3168       Instruction::CastOps CastOp =
3169           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3170       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3171       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3172       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3173       EndValue->setName("ind.end");
3174     }
3175 
3176     // The new PHI merges the original incoming value, in case of a bypass,
3177     // or the value at the end of the vectorized loop.
3178     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3179 
3180     // Fix the scalar body counter (PHI node).
3181     // The old induction's phi node in the scalar body needs the truncated
3182     // value.
3183     for (BasicBlock *BB : LoopBypassBlocks)
3184       BCResumeVal->addIncoming(II.getStartValue(), BB);
3185     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3186   }
3187 }
3188 
3189 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3190                                                       MDNode *OrigLoopID) {
3191   assert(L && "Expected valid loop.");
3192 
3193   // The trip counts should be cached by now.
3194   Value *Count = getOrCreateTripCount(L);
3195   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3196 
3197   // We need the OrigLoop (scalar loop part) latch terminator to help
3198   // produce correct debug info for the middle block BB instructions.
3199   // The legality check stage guarantees that the loop will have a single
3200   // latch.
3201   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3202          "Scalar loop latch terminator isn't a branch");
3203   BranchInst *ScalarLatchBr =
3204       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3205 
3206   // Add a check in the middle block to see if we have completed
3207   // all of the iterations in the first vector loop.
3208   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3209   // If tail is to be folded, we know we don't need to run the remainder.
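  // For illustration (assuming VF * UF = 8 and no required scalar epilogue):
  // N = 16 yields a vector trip count of 16 == N, so the remainder is skipped;
  // N = 13 yields 8 != 13, and the scalar loop runs the remaining 5 iterations.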
3210   Value *CmpN = Builder.getTrue();
3211   if (!Cost->foldTailByMasking()) {
3212     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3213                            VectorTripCount, "cmp.n",
3214                            LoopMiddleBlock->getTerminator());
3215 
3216     // Here we use the same DebugLoc as the scalar loop latch branch instead
3217     // of the corresponding compare because they may have ended up with
3218     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3220     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3221   }
3222 
3223   BranchInst *BrInst =
3224       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3225   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3226   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3227 
3228   // Get ready to start creating new instructions into the vectorized body.
3229   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3230          "Inconsistent vector loop preheader");
3231   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3232 
3233   Optional<MDNode *> VectorizedLoopID =
3234       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3235                                       LLVMLoopVectorizeFollowupVectorized});
3236   if (VectorizedLoopID.hasValue()) {
3237     L->setLoopID(VectorizedLoopID.getValue());
3238 
3239     // Do not setAlreadyVectorized if loop attributes have been defined
3240     // explicitly.
3241     return LoopVectorPreHeader;
3242   }
3243 
3244   // Keep all loop hints from the original loop on the vector loop (we'll
3245   // replace the vectorizer-specific hints below).
3246   if (MDNode *LID = OrigLoop->getLoopID())
3247     L->setLoopID(LID);
3248 
3249   LoopVectorizeHints Hints(L, true, *ORE);
3250   Hints.setAlreadyVectorized();
3251 
3252 #ifdef EXPENSIVE_CHECKS
3253   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3254   LI->verify(*DT);
3255 #endif
3256 
3257   return LoopVectorPreHeader;
3258 }
3259 
3260 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3261   /*
3262    In this function we generate a new loop. The new loop will contain
3263    the vectorized instructions while the old loop will continue to run the
3264    scalar remainder.
3265 
3266        [ ] <-- loop iteration number check.
3267     /   |
3268    /    v
3269   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3270   |  /  |
3271   | /   v
3272   ||   [ ]     <-- vector pre header.
3273   |/    |
3274   |     v
3275   |    [  ] \
3276   |    [  ]_|   <-- vector loop.
3277   |     |
3278   |     v
3279   |   -[ ]   <--- middle-block.
3280   |  /  |
3281   | /   v
3282   -|- >[ ]     <--- new preheader.
3283    |    |
3284    |    v
3285    |   [ ] \
3286    |   [ ]_|   <-- old scalar loop to handle remainder.
3287     \   |
3288      \  v
3289       >[ ]     <-- exit block.
3290    ...
3291    */
3292 
3293   // Get the metadata of the original loop before it gets modified.
3294   MDNode *OrigLoopID = OrigLoop->getLoopID();
3295 
3296   // Create an empty vector loop, and prepare basic blocks for the runtime
3297   // checks.
3298   Loop *Lp = createVectorLoopSkeleton("");
3299 
3300   // Now, compare the new count to zero. If it is zero skip the vector loop and
3301   // jump to the scalar loop. This check also covers the case where the
3302   // backedge-taken count is uint##_max: adding one to it will overflow leading
3303   // to an incorrect trip count of zero. In this (rare) case we will also jump
3304   // to the scalar loop.
3305   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3306 
3307   // Generate the code to check any assumptions that we've made for SCEV
3308   // expressions.
3309   emitSCEVChecks(Lp, LoopScalarPreHeader);
3310 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
3314   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3315 
3316   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3318   // induction variables. In the code below we also support a case where we
3319   // don't have a single induction variable.
3320   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3323   //   - is an integer
3324   //   - counts from zero, stepping by one
3325   //   - is the size of the widest induction variable type
3326   // then we create a new one.
3327   OldInduction = Legal->getPrimaryInduction();
3328   Type *IdxTy = Legal->getWidestInductionType();
3329   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3330   // The loop step is equal to the vectorization factor (num of SIMD elements)
3331   // times the unroll factor (num of SIMD instructions).
3332   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3333   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3334   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3335   Induction =
3336       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3337                               getDebugLocFromInstOrOperands(OldInduction));
3338 
3339   // Emit phis for the new starting index of the scalar loop.
3340   createInductionResumeValues(Lp, CountRoundDown);
3341 
3342   return completeLoopSkeleton(Lp, OrigLoopID);
3343 }
3344 
3345 // Fix up external users of the induction variable. At this point, we are
3346 // in LCSSA form, with all external PHIs that use the IV having one input value,
3347 // coming from the remainder loop. We need those PHIs to also have a correct
3348 // value for the IV when arriving directly from the middle block.
3349 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3350                                        const InductionDescriptor &II,
3351                                        Value *CountRoundDown, Value *EndValue,
3352                                        BasicBlock *MiddleBlock) {
3353   // There are two kinds of external IV usages - those that use the value
3354   // computed in the last iteration (the PHI) and those that use the penultimate
3355   // value (the value that feeds into the phi from the loop latch).
3356   // We allow both, but they, obviously, have different values.
3357 
3358   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3359 
3360   DenseMap<Value *, Value *> MissingVals;
3361 
3362   // An external user of the last iteration's value should see the value that
3363   // the remainder loop uses to initialize its own IV.
3364   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3365   for (User *U : PostInc->users()) {
3366     Instruction *UI = cast<Instruction>(U);
3367     if (!OrigLoop->contains(UI)) {
3368       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3369       MissingVals[UI] = EndValue;
3370     }
3371   }
3372 
  // An external user of the penultimate value needs to see EndValue - Step.
3374   // The simplest way to get this is to recompute it from the constituent SCEVs,
3375   // that is Start + (Step * (CRD - 1)).
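  // For illustration (assuming an integer induction with Start = 0, Step = 2,
  // CRD = 8): users of the post-increment value see EndValue = 16, while users
  // of the phi itself see 0 + 2 * (8 - 1) = 14.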
3376   for (User *U : OrigPhi->users()) {
3377     auto *UI = cast<Instruction>(U);
3378     if (!OrigLoop->contains(UI)) {
3379       const DataLayout &DL =
3380           OrigLoop->getHeader()->getModule()->getDataLayout();
3381       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3382 
3383       IRBuilder<> B(MiddleBlock->getTerminator());
3384       Value *CountMinusOne = B.CreateSub(
3385           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3386       Value *CMO =
3387           !II.getStep()->getType()->isIntegerTy()
3388               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3389                              II.getStep()->getType())
3390               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3391       CMO->setName("cast.cmo");
3392       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3393       Escape->setName("ind.escape");
3394       MissingVals[UI] = Escape;
3395     }
3396   }
3397 
3398   for (auto &I : MissingVals) {
3399     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3401     // that is %IV2 = phi [...], [ %IV1, %latch ]
3402     // In this case, if IV1 has an external use, we need to avoid adding both
3403     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3404     // don't already have an incoming value for the middle block.
3405     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3406       PHI->addIncoming(I.second, MiddleBlock);
3407   }
3408 }
3409 
3410 namespace {
3411 
3412 struct CSEDenseMapInfo {
3413   static bool canHandle(const Instruction *I) {
3414     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3415            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3416   }
3417 
3418   static inline Instruction *getEmptyKey() {
3419     return DenseMapInfo<Instruction *>::getEmptyKey();
3420   }
3421 
3422   static inline Instruction *getTombstoneKey() {
3423     return DenseMapInfo<Instruction *>::getTombstoneKey();
3424   }
3425 
3426   static unsigned getHashValue(const Instruction *I) {
3427     assert(canHandle(I) && "Unknown instruction!");
3428     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3429                                                            I->value_op_end()));
3430   }
3431 
3432   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3433     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3434         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3435       return LHS == RHS;
3436     return LHS->isIdenticalTo(RHS);
3437   }
3438 };
3439 
3440 } // end anonymous namespace
3441 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3445   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3446   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3447     Instruction *In = &*I++;
3448 
3449     if (!CSEDenseMapInfo::canHandle(In))
3450       continue;
3451 
3452     // Check if we can replace this instruction with any of the
3453     // visited instructions.
3454     if (Instruction *V = CSEMap.lookup(In)) {
3455       In->replaceAllUsesWith(V);
3456       In->eraseFromParent();
3457       continue;
3458     }
3459 
3460     CSEMap[In] = In;
3461   }
3462 }
3463 
3464 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3465                                                        ElementCount VF,
3466                                                        bool &NeedToScalarize) {
3467   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3468   Function *F = CI->getCalledFunction();
3469   Type *ScalarRetTy = CI->getType();
3470   SmallVector<Type *, 4> Tys, ScalarTys;
3471   for (auto &ArgOp : CI->arg_operands())
3472     ScalarTys.push_back(ArgOp->getType());
3473 
3474   // Estimate cost of scalarized vector call. The source operands are assumed
3475   // to be vectors, so we need to extract individual elements from there,
3476   // execute VF scalar calls, and then gather the result into the vector return
3477   // value.
3478   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3479                                                  TTI::TCK_RecipThroughput);
3480   if (VF.isScalar())
3481     return ScalarCallCost;
3482 
3483   // Compute corresponding vector type for return value and arguments.
3484   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3485   for (Type *ScalarTy : ScalarTys)
3486     Tys.push_back(ToVectorTy(ScalarTy, VF));
3487 
3488   // Compute costs of unpacking argument values for the scalar calls and
3489   // packing the return values to a vector.
3490   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3491 
3492   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
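  // For illustration (hypothetical numbers, VF = 4): a scalar call cost of 10
  // plus a scalarization overhead of 12 gives 4 * 10 + 12 = 52; if a vector
  // variant of the callee costs less than that, it is preferred below and
  // NeedToScalarize is cleared.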
3493 
3494   // If we can't emit a vector call for this function, then the currently found
3495   // cost is the cost we need to return.
3496   NeedToScalarize = true;
3497   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3498   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3499 
3500   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3501     return Cost;
3502 
3503   // If the corresponding vector cost is cheaper, return its cost.
3504   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3505                                                  TTI::TCK_RecipThroughput);
3506   if (VectorCallCost < Cost) {
3507     NeedToScalarize = false;
3508     return VectorCallCost;
3509   }
3510   return Cost;
3511 }
3512 
3513 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3514                                                             ElementCount VF) {
3515   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3516   assert(ID && "Expected intrinsic call!");
3517 
3518   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3519   return TTI.getIntrinsicInstrCost(CostAttrs,
3520                                    TargetTransformInfo::TCK_RecipThroughput);
3521 }
3522 
3523 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3524   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3525   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3526   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3527 }
3528 
3529 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3530   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3531   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3532   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3533 }
3534 
3535 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3536   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
3538   // later and will remove any ext/trunc pairs.
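  //
  // For illustration (assuming MinBWs records 8 bits for an i32 add and
  // VF = 4): the <4 x i32> add is rewritten as a trunc to <4 x i8>, a
  // <4 x i8> add, and a zext back to <4 x i32>; InstCombine later removes the
  // redundant casts.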
3539   SmallPtrSet<Value *, 4> Erased;
3540   for (const auto &KV : Cost->getMinimalBitwidths()) {
3541     // If the value wasn't vectorized, we must maintain the original scalar
3542     // type. The absence of the value from VectorLoopValueMap indicates that it
3543     // wasn't vectorized.
3544     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3545       continue;
3546     for (unsigned Part = 0; Part < UF; ++Part) {
3547       Value *I = getOrCreateVectorValue(KV.first, Part);
3548       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3549         continue;
3550       Type *OriginalTy = I->getType();
3551       Type *ScalarTruncatedTy =
3552           IntegerType::get(OriginalTy->getContext(), KV.second);
3553       auto *TruncatedTy = FixedVectorType::get(
3554           ScalarTruncatedTy,
3555           cast<FixedVectorType>(OriginalTy)->getNumElements());
3556       if (TruncatedTy == OriginalTy)
3557         continue;
3558 
3559       IRBuilder<> B(cast<Instruction>(I));
3560       auto ShrinkOperand = [&](Value *V) -> Value * {
3561         if (auto *ZI = dyn_cast<ZExtInst>(V))
3562           if (ZI->getSrcTy() == TruncatedTy)
3563             return ZI->getOperand(0);
3564         return B.CreateZExtOrTrunc(V, TruncatedTy);
3565       };
3566 
3567       // The actual instruction modification depends on the instruction type,
3568       // unfortunately.
3569       Value *NewI = nullptr;
3570       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3571         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3572                              ShrinkOperand(BO->getOperand(1)));
3573 
3574         // Any wrapping introduced by shrinking this operation shouldn't be
3575         // considered undefined behavior. So, we can't unconditionally copy
3576         // arithmetic wrapping flags to NewI.
3577         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3578       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3579         NewI =
3580             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3581                          ShrinkOperand(CI->getOperand(1)));
3582       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3583         NewI = B.CreateSelect(SI->getCondition(),
3584                               ShrinkOperand(SI->getTrueValue()),
3585                               ShrinkOperand(SI->getFalseValue()));
3586       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3587         switch (CI->getOpcode()) {
3588         default:
3589           llvm_unreachable("Unhandled cast!");
3590         case Instruction::Trunc:
3591           NewI = ShrinkOperand(CI->getOperand(0));
3592           break;
3593         case Instruction::SExt:
3594           NewI = B.CreateSExtOrTrunc(
3595               CI->getOperand(0),
3596               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3597           break;
3598         case Instruction::ZExt:
3599           NewI = B.CreateZExtOrTrunc(
3600               CI->getOperand(0),
3601               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3602           break;
3603         }
3604       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3605         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3606                              ->getNumElements();
3607         auto *O0 = B.CreateZExtOrTrunc(
3608             SI->getOperand(0),
3609             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3610         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3611                              ->getNumElements();
3612         auto *O1 = B.CreateZExtOrTrunc(
3613             SI->getOperand(1),
3614             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3615 
3616         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3617       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3618         // Don't do anything with the operands, just extend the result.
3619         continue;
3620       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3621         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3622                             ->getNumElements();
3623         auto *O0 = B.CreateZExtOrTrunc(
3624             IE->getOperand(0),
3625             FixedVectorType::get(ScalarTruncatedTy, Elements));
3626         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3627         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3628       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3629         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3630                             ->getNumElements();
3631         auto *O0 = B.CreateZExtOrTrunc(
3632             EE->getOperand(0),
3633             FixedVectorType::get(ScalarTruncatedTy, Elements));
3634         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3635       } else {
3636         // If we don't know what to do, be conservative and don't do anything.
3637         continue;
3638       }
3639 
3640       // Lastly, extend the result.
3641       NewI->takeName(cast<Instruction>(I));
3642       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3643       I->replaceAllUsesWith(Res);
3644       cast<Instruction>(I)->eraseFromParent();
3645       Erased.insert(I);
3646       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3647     }
3648   }
3649 
  // We'll have created a number of ZExts that are now dead. Clean them up.
3651   for (const auto &KV : Cost->getMinimalBitwidths()) {
3652     // If the value wasn't vectorized, we must maintain the original scalar
3653     // type. The absence of the value from VectorLoopValueMap indicates that it
3654     // wasn't vectorized.
3655     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3656       continue;
3657     for (unsigned Part = 0; Part < UF; ++Part) {
3658       Value *I = getOrCreateVectorValue(KV.first, Part);
3659       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3660       if (Inst && Inst->use_empty()) {
3661         Value *NewI = Inst->getOperand(0);
3662         Inst->eraseFromParent();
3663         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3664       }
3665     }
3666   }
3667 }
3668 
3669 void InnerLoopVectorizer::fixVectorizedLoop() {
3670   // Insert truncates and extends for any truncated instructions as hints to
3671   // InstCombine.
3672   if (VF.isVector())
3673     truncateToMinimalBitwidths();
3674 
3675   // Fix widened non-induction PHIs by setting up the PHI operands.
3676   if (OrigPHIsToFix.size()) {
3677     assert(EnableVPlanNativePath &&
3678            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3679     fixNonInductionPHIs();
3680   }
3681 
3682   // At this point every instruction in the original loop is widened to a
3683   // vector form. Now we need to fix the recurrences in the loop. These PHI
3684   // nodes are currently empty because we did not want to introduce cycles.
3685   // This is the second stage of vectorizing recurrences.
3686   fixCrossIterationPHIs();
3687 
3688   // Forget the original basic block.
3689   PSE.getSE()->forgetLoop(OrigLoop);
3690 
3691   // Fix-up external users of the induction variables.
3692   for (auto &Entry : Legal->getInductionVars())
3693     fixupIVUsers(Entry.first, Entry.second,
3694                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3695                  IVEndValues[Entry.first], LoopMiddleBlock);
3696 
3697   fixLCSSAPHIs();
3698   for (Instruction *PI : PredicatedInstructions)
3699     sinkScalarOperands(&*PI);
3700 
3701   // Remove redundant induction instructions.
3702   cse(LoopVectorBody);
3703 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may end
  // up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
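  //
  // As an illustration, assuming the helper simply rescales the trip count: if
  // the original loop's profile reports an average of 1000 iterations and
  // VF * UF == 8, the vector loop is credited with roughly 1000 / 8 = 125
  // iterations and the scalar remainder loop with the few left over.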
3713   assert(!VF.isScalable() &&
3714          "cannot use scalable ElementCount to determine unroll factor");
3715   setProfileInfoAfterUnrolling(
3716       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3717       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3718 }
3719 
3720 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3721   // In order to support recurrences we need to be able to vectorize Phi nodes.
3722   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3723   // stage #2: We now need to fix the recurrences by adding incoming edges to
3724   // the currently empty PHI nodes. At this point every instruction in the
3725   // original loop is widened to a vector form so we can use them to construct
3726   // the incoming edges.
3727   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3728     // Handle first-order recurrences and reductions that need to be fixed.
3729     if (Legal->isFirstOrderRecurrence(&Phi))
3730       fixFirstOrderRecurrence(&Phi);
3731     else if (Legal->isReductionVariable(&Phi))
3732       fixReduction(&Phi);
3733   }
3734 }
3735 
3736 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3737   // This is the second phase of vectorizing first-order recurrences. An
3738   // overview of the transformation is described below. Suppose we have the
3739   // following loop.
3740   //
3741   //   for (int i = 0; i < n; ++i)
3742   //     b[i] = a[i] - a[i - 1];
3743   //
3744   // There is a first-order recurrence on "a". For this loop, the shorthand
3745   // scalar IR looks like:
3746   //
3747   //   scalar.ph:
3748   //     s_init = a[-1]
3749   //     br scalar.body
3750   //
3751   //   scalar.body:
3752   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3753   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3754   //     s2 = a[i]
3755   //     b[i] = s2 - s1
3756   //     br cond, scalar.body, ...
3757   //
  // In this example, s1 is a recurrence because its value depends on the
3759   // previous iteration. In the first phase of vectorization, we created a
3760   // temporary value for s1. We now complete the vectorization and produce the
3761   // shorthand vector IR shown below (for VF = 4, UF = 1).
3762   //
3763   //   vector.ph:
3764   //     v_init = vector(..., ..., ..., a[-1])
3765   //     br vector.body
3766   //
3767   //   vector.body
3768   //     i = phi [0, vector.ph], [i+4, vector.body]
3769   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3770   //     v2 = a[i, i+1, i+2, i+3];
3771   //     v3 = vector(v1(3), v2(0, 1, 2))
3772   //     b[i, i+1, i+2, i+3] = v2 - v3
3773   //     br cond, vector.body, middle.block
3774   //
3775   //   middle.block:
3776   //     x = v2(3)
3777   //     br scalar.ph
3778   //
3779   //   scalar.ph:
3780   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3781   //     br scalar.body
3782   //
  // After the vector loop has executed, we extract the next value of the
  // recurrence (x) to use as the initial value in the scalar loop.
3785 
3786   // Get the original loop preheader and single loop latch.
3787   auto *Preheader = OrigLoop->getLoopPreheader();
3788   auto *Latch = OrigLoop->getLoopLatch();
3789 
3790   // Get the initial and previous values of the scalar recurrence.
3791   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3792   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3793 
3794   // Create a vector from the initial value.
3795   auto *VectorInit = ScalarInit;
3796   if (VF.isVector()) {
3797     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3798     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3799     VectorInit = Builder.CreateInsertElement(
3800         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3801         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3802   }
3803 
3804   // We constructed a temporary phi node in the first phase of vectorization.
3805   // This phi node will eventually be deleted.
3806   Builder.SetInsertPoint(
3807       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3808 
3809   // Create a phi node for the new recurrence. The current value will either be
3810   // the initial value inserted into a vector or loop-varying vector value.
3811   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3812   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3813 
3814   // Get the vectorized previous value of the last part UF - 1. It appears last
3815   // among all unrolled iterations, due to the order of their construction.
3816   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3817 
3818   // Find and set the insertion point after the previous value if it is an
3819   // instruction.
3820   BasicBlock::iterator InsertPt;
3821   // Note that the previous value may have been constant-folded so it is not
3822   // guaranteed to be an instruction in the vector loop.
3823   // FIXME: Loop invariant values do not form recurrences. We should deal with
3824   //        them earlier.
3825   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3826     InsertPt = LoopVectorBody->getFirstInsertionPt();
3827   else {
3828     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3829     if (isa<PHINode>(PreviousLastPart))
3830       // If the previous value is a phi node, we should insert after all the phi
3831       // nodes in the block containing the PHI to avoid breaking basic block
3832       // verification. Note that the basic block may be different to
3833       // LoopVectorBody, in case we predicate the loop.
3834       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3835     else
3836       InsertPt = ++PreviousInst->getIterator();
3837   }
3838   Builder.SetInsertPoint(&*InsertPt);
3839 
3840   // We will construct a vector for the recurrence by combining the values for
3841   // the current and previous iterations. This is the required shuffle mask.
3842   assert(!VF.isScalable());
3843   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3844   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3845   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3846     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
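  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 3 of the first
  // shuffle operand (the recurrence vector carried from the previous
  // iteration) followed by lanes 0-2 of the second operand (the current
  // iteration's values), matching v3 = vector(v1(3), v2(0, 1, 2)) above.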
3847 
3848   // The vector from which to take the initial value for the current iteration
3849   // (actual or unrolled). Initially, this is the vector phi node.
3850   Value *Incoming = VecPhi;
3851 
3852   // Shuffle the current and previous vector and update the vector parts.
3853   for (unsigned Part = 0; Part < UF; ++Part) {
3854     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3855     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3856     auto *Shuffle =
3857         VF.isVector()
3858             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3859             : Incoming;
3860     PhiPart->replaceAllUsesWith(Shuffle);
3861     cast<Instruction>(PhiPart)->eraseFromParent();
3862     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3863     Incoming = PreviousPart;
3864   }
3865 
3866   // Fix the latch value of the new recurrence in the vector loop.
3867   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3868 
3869   // Extract the last vector element in the middle block. This will be the
3870   // initial value for the recurrence when jumping to the scalar loop.
3871   auto *ExtractForScalar = Incoming;
3872   if (VF.isVector()) {
3873     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3874     ExtractForScalar = Builder.CreateExtractElement(
3875         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3876         "vector.recur.extract");
3877   }
  // Extract the second-to-last element in the middle block if the
3879   // Phi is used outside the loop. We need to extract the phi itself
3880   // and not the last element (the phi update in the current iteration). This
3881   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3882   // when the scalar loop is not run at all.
3883   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3884   if (VF.isVector())
3885     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3886         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3887         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3892   else if (UF > 1)
3893     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3894 
3895   // Fix the initial value of the original recurrence in the scalar loop.
3896   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3897   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3898   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3899     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3900     Start->addIncoming(Incoming, BB);
3901   }
3902 
3903   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3904   Phi->setName("scalar.recur");
3905 
3906   // Finally, fix users of the recurrence outside the loop. The users will need
3907   // either the last value of the scalar recurrence or the last value of the
3908   // vector recurrence we extracted in the middle block. Since the loop is in
3909   // LCSSA form, we just need to find all the phi nodes for the original scalar
3910   // recurrence in the exit block, and then add an edge for the middle block.
3911   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3912     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3913       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3914     }
3915   }
3916 }
3917 
3918 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3919   Constant *Zero = Builder.getInt32(0);
3920 
  // Get its reduction variable descriptor.
3922   assert(Legal->isReductionVariable(Phi) &&
3923          "Unable to find the reduction variable");
3924   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3925 
3926   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3927   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3928   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3929   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3930     RdxDesc.getMinMaxRecurrenceKind();
3931   setDebugLocFromInst(Builder, ReductionStartValue);
3932   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3933 
3934   // We need to generate a reduction vector from the incoming scalar.
3935   // To do so, we need to generate the 'identity' vector and override
3936   // one of the elements with the incoming scalar reduction. We need
3937   // to do it in the vector-loop preheader.
3938   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3939 
3940   // This is the vector-clone of the value that leaves the loop.
3941   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3942 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 for and.
3945   Value *Identity;
3946   Value *VectorStart;
3947   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3948       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3950     if (VF.isScalar() || IsInLoopReductionPhi) {
3951       VectorStart = Identity = ReductionStartValue;
3952     } else {
3953       VectorStart = Identity =
3954         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3955     }
3956   } else {
3957     // Handle other reduction kinds:
3958     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3959         RK, MinMaxKind, VecTy->getScalarType());
3960     if (VF.isScalar() || IsInLoopReductionPhi) {
3961       Identity = Iden;
      // For a scalar or in-loop reduction no identity vector is needed; the
      // incoming scalar start value is used directly.
3964       VectorStart = ReductionStartValue;
3965     } else {
3966       Identity = ConstantVector::getSplat(VF, Iden);
3967 
3968       // This vector is the Identity vector where the first element is the
3969       // incoming scalar reduction.
3970       VectorStart =
3971         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3972     }
3973   }
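  // For example, for an integer add reduction with scalar start value %s and
  // VF = 4, Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>, so the
  // start value is accumulated exactly once.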
3974 
3975   // Wrap flags are in general invalid after vectorization, clear them.
3976   clearReductionWrapFlags(RdxDesc);
3977 
3978   // Fix the vector-loop phi.
3979 
3980   // Reductions do not have to start at zero. They can start with
3981   // any loop invariant values.
3982   BasicBlock *Latch = OrigLoop->getLoopLatch();
3983   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3984 
3985   for (unsigned Part = 0; Part < UF; ++Part) {
3986     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3987     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3988     // Make sure to add the reduction start value only to the
3989     // first unroll part.
3990     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3991     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3992     cast<PHINode>(VecRdxPhi)
3993       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3994   }
3995 
3996   // Before each round, move the insertion point right between
3997   // the PHIs and the values we are going to write.
3998   // This allows us to write both PHINodes and the extractelement
3999   // instructions.
4000   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4001 
4002   setDebugLocFromInst(Builder, LoopExitInst);
4003 
  // If the tail is folded by masking, the vector value to leave the loop should
  // be a Select choosing between the vectorized LoopExitInst and the vectorized
  // Phi, rather than the LoopExitInst alone. For an in-loop reduction the
  // reduction will already be predicated, and does not need to be handled here.
4008   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4009     for (unsigned Part = 0; Part < UF; ++Part) {
4010       Value *VecLoopExitInst =
4011           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4012       Value *Sel = nullptr;
4013       for (User *U : VecLoopExitInst->users()) {
4014         if (isa<SelectInst>(U)) {
4015           assert(!Sel && "Reduction exit feeding two selects");
4016           Sel = U;
4017         } else
4018           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4019       }
4020       assert(Sel && "Reduction exit feeds no select");
4021       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4022 
4023       // If the target can create a predicated operator for the reduction at no
4024       // extra cost in the loop (for example a predicated vadd), it can be
4025       // cheaper for the select to remain in the loop than be sunk out of it,
4026       // and so use the select value for the phi instead of the old
4027       // LoopExitValue.
4028       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4029       if (PreferPredicatedReductionSelect ||
4030           TTI->preferPredicatedReductionSelect(
4031               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4032               TargetTransformInfo::ReductionFlags())) {
4033         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4034         VecRdxPhi->setIncomingValueForBlock(
4035             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4036       }
4037     }
4038   }
4039 
4040   // If the vector reduction can be performed in a smaller type, we truncate
4041   // then extend the loop exit value to enable InstCombine to evaluate the
4042   // entire expression in the smaller type.
4043   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4044     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4045     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4046     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4047     Builder.SetInsertPoint(
4048         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4049     VectorParts RdxParts(UF);
4050     for (unsigned Part = 0; Part < UF; ++Part) {
4051       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4052       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4053       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4054                                         : Builder.CreateZExt(Trunc, VecTy);
4055       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4056            UI != RdxParts[Part]->user_end();)
4057         if (*UI != Trunc) {
4058           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4059           RdxParts[Part] = Extnd;
4060         } else {
4061           ++UI;
4062         }
4063     }
4064     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4065     for (unsigned Part = 0; Part < UF; ++Part) {
4066       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4067       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4068     }
4069   }
4070 
4071   // Reduce all of the unrolled parts into a single vector.
4072   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4073   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4074 
4075   // The middle block terminator has already been assigned a DebugLoc here (the
4076   // OrigLoop's single latch terminator). We want the whole middle block to
4077   // appear to execute on this line because: (a) it is all compiler generated,
4078   // (b) these instructions are always executed after evaluating the latch
4079   // conditional branch, and (c) other passes may add new predecessors which
4080   // terminate on this line. This is the easiest way to ensure we don't
4081   // accidentally cause an extra step back into the loop while debugging.
4082   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4083   for (unsigned Part = 1; Part < UF; ++Part) {
4084     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4085     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4086       // Floating point operations had to be 'fast' to enable the reduction.
4087       ReducedPartRdx = addFastMathFlag(
4088           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4089                               ReducedPartRdx, "bin.rdx"),
4090           RdxDesc.getFastMathFlags());
4091     else
4092       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4093                                       RdxPart);
4094   }
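  // For example, with UF = 2 and an integer add reduction, the loop above
  // emits a single vector add named "bin.rdx" combining part 1 with part 0;
  // min/max kinds instead go through createMinMaxOp, which is roughly a
  // compare followed by a select.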
4095 
4096   // Create the reduction after the loop. Note that inloop reductions create the
4097   // target reduction in the loop using a Reduction recipe.
4098   if (VF.isVector() && !IsInLoopReductionPhi) {
4099     bool NoNaN = Legal->hasFunNoNaNAttr();
4100     ReducedPartRdx =
4101         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4102     // If the reduction can be performed in a smaller type, we need to extend
4103     // the reduction to the wider type before we branch to the original loop.
4104     if (Phi->getType() != RdxDesc.getRecurrenceType())
4105       ReducedPartRdx =
4106         RdxDesc.isSigned()
4107         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4108         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4109   }
4110 
4111   // Create a phi node that merges control-flow from the backedge-taken check
4112   // block and the middle block.
4113   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4114                                         LoopScalarPreHeader->getTerminator());
4115   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4116     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4117   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4118 
4119   // Now, we need to fix the users of the reduction variable
4120   // inside and outside of the scalar remainder loop.
4121   // We know that the loop is in LCSSA form. We need to update the
4122   // PHI nodes in the exit blocks.
4123   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4124     // All PHINodes need to have a single entry edge, or two if
4125     // we already fixed them.
4126     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4127 
4128     // We found a reduction value exit-PHI. Update it with the
4129     // incoming bypass edge.
4130     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4131       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4132   } // end of the LCSSA phi scan.
4133 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4136   int IncomingEdgeBlockIdx =
4137     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4138   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4139   // Pick the other block.
4140   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4141   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4142   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4143 }
4144 
4145 void InnerLoopVectorizer::clearReductionWrapFlags(
4146     RecurrenceDescriptor &RdxDesc) {
4147   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4148   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4149       RK != RecurrenceDescriptor::RK_IntegerMult)
4150     return;
4151 
4152   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4153   assert(LoopExitInstr && "null loop exit instruction");
4154   SmallVector<Instruction *, 8> Worklist;
4155   SmallPtrSet<Instruction *, 8> Visited;
4156   Worklist.push_back(LoopExitInstr);
4157   Visited.insert(LoopExitInstr);
4158 
4159   while (!Worklist.empty()) {
4160     Instruction *Cur = Worklist.pop_back_val();
4161     if (isa<OverflowingBinaryOperator>(Cur))
4162       for (unsigned Part = 0; Part < UF; ++Part) {
4163         Value *V = getOrCreateVectorValue(Cur, Part);
4164         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4165       }
4166 
4167     for (User *U : Cur->users()) {
4168       Instruction *UI = cast<Instruction>(U);
4169       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4170           Visited.insert(UI).second)
4171         Worklist.push_back(UI);
4172     }
4173   }
4174 }
4175 
4176 void InnerLoopVectorizer::fixLCSSAPHIs() {
4177   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4178   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4179     if (LCSSAPhi.getNumIncomingValues() == 1) {
4180       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have only a single lane, so use lane 0.
4182       unsigned LastLane = 0;
4183       if (isa<Instruction>(IncomingValue))
4184         LastLane = Cost->isUniformAfterVectorization(
4185                        cast<Instruction>(IncomingValue), VF)
4186                        ? 0
4187                        : VF.getKnownMinValue() - 1;
4188       // Can be a loop invariant incoming value or the last scalar value to be
4189       // extracted from the vectorized loop.
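      // For instance, with VF = 4 and UF = 2 a non-uniform instruction feeds
      // the LCSSA phi with scalar lane 3 of unroll part 1, i.e. the last
      // scalar value produced by the vector loop; a uniform instruction uses
      // lane 0 instead.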
4190       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4191       Value *lastIncomingValue =
4192           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4193       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4194     }
4195   }
4196 }
4197 
4198 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4199   // The basic block and loop containing the predicated instruction.
4200   auto *PredBB = PredInst->getParent();
4201   auto *VectorLoop = LI->getLoopFor(PredBB);
4202 
4203   // Initialize a worklist with the operands of the predicated instruction.
4204   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4205 
4206   // Holds instructions that we need to analyze again. An instruction may be
4207   // reanalyzed if we don't yet know if we can sink it or not.
4208   SmallVector<Instruction *, 8> InstsToReanalyze;
4209 
4210   // Returns true if a given use occurs in the predicated block. Phi nodes use
4211   // their operands in their corresponding predecessor blocks.
4212   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4213     auto *I = cast<Instruction>(U.getUser());
4214     BasicBlock *BB = I->getParent();
4215     if (auto *Phi = dyn_cast<PHINode>(I))
4216       BB = Phi->getIncomingBlock(
4217           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4218     return BB == PredBB;
4219   };
4220 
4221   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
4224   // through the worklist doesn't sink a single instruction.
4225   bool Changed;
4226   do {
4227     // Add the instructions that need to be reanalyzed to the worklist, and
4228     // reset the changed indicator.
4229     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4230     InstsToReanalyze.clear();
4231     Changed = false;
4232 
4233     while (!Worklist.empty()) {
4234       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4235 
4236       // We can't sink an instruction if it is a phi node, is already in the
4237       // predicated block, is not in the loop, or may have side effects.
4238       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4239           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4240         continue;
4241 
4242       // It's legal to sink the instruction if all its uses occur in the
4243       // predicated block. Otherwise, there's nothing to do yet, and we may
4244       // need to reanalyze the instruction.
4245       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4246         InstsToReanalyze.push_back(I);
4247         continue;
4248       }
4249 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4252       I->moveBefore(&*PredBB->getFirstInsertionPt());
4253       Worklist.insert(I->op_begin(), I->op_end());
4254 
4255       // The sinking may have enabled other instructions to be sunk, so we will
4256       // need to iterate.
4257       Changed = true;
4258     }
4259   } while (Changed);
4260 }
4261 
4262 void InnerLoopVectorizer::fixNonInductionPHIs() {
4263   for (PHINode *OrigPhi : OrigPHIsToFix) {
4264     PHINode *NewPhi =
4265         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4266     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4267 
4268     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4269         predecessors(OrigPhi->getParent()));
4270     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4271         predecessors(NewPhi->getParent()));
4272     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4273            "Scalar and Vector BB should have the same number of predecessors");
4274 
4275     // The insertion point in Builder may be invalidated by the time we get
4276     // here. Force the Builder insertion point to something valid so that we do
4277     // not run into issues during insertion point restore in
4278     // getOrCreateVectorValue calls below.
4279     Builder.SetInsertPoint(NewPhi);
4280 
4281     // The predecessor order is preserved and we can rely on mapping between
4282     // scalar and vector block predecessors.
4283     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4284       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4285 
4286       // When looking up the new scalar/vector values to fix up, use incoming
4287       // values from original phi.
4288       Value *ScIncV =
4289           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4290 
4291       // Scalar incoming value may need a broadcast
4292       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4293       NewPhi->addIncoming(NewIncV, NewPredBB);
4294     }
4295   }
4296 }
4297 
4298 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4299                                    VPUser &Operands, unsigned UF,
4300                                    ElementCount VF, bool IsPtrLoopInvariant,
4301                                    SmallBitVector &IsIndexLoopInvariant,
4302                                    VPTransformState &State) {
4303   // Construct a vector GEP by widening the operands of the scalar GEP as
4304   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4305   // results in a vector of pointers when at least one operand of the GEP
4306   // is vector-typed. Thus, to keep the representation compact, we only use
4307   // vector-typed operands for loop-varying values.
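  //
  // For instance (shorthand, VF = 4), a GEP with a loop-invariant base and a
  // loop-varying index
  //   %p = getelementptr i32, i32* %base, i64 %idx
  // is widened to
  //   %p.vec = getelementptr i32, i32* %base, <4 x i64> %idx.vec
  // yielding one vector of pointers per unroll part.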
4308 
4309   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4310     // If we are vectorizing, but the GEP has only loop-invariant operands,
4311     // the GEP we build (by only using vector-typed operands for
4312     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4313     // produce a vector of pointers, we need to either arbitrarily pick an
4314     // operand to broadcast, or broadcast a clone of the original GEP.
4315     // Here, we broadcast a clone of the original.
4316     //
4317     // TODO: If at some point we decide to scalarize instructions having
4318     //       loop-invariant operands, this special case will no longer be
4319     //       required. We would add the scalarization decision to
4320     //       collectLoopScalars() and teach getVectorValue() to broadcast
4321     //       the lane-zero scalar value.
4322     auto *Clone = Builder.Insert(GEP->clone());
4323     for (unsigned Part = 0; Part < UF; ++Part) {
4324       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4325       State.set(VPDef, GEP, EntryPart, Part);
4326       addMetadata(EntryPart, GEP);
4327     }
4328   } else {
4329     // If the GEP has at least one loop-varying operand, we are sure to
4330     // produce a vector of pointers. But if we are only unrolling, we want
4331     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4332     // produce with the code below will be scalar (if VF == 1) or vector
4333     // (otherwise). Note that for the unroll-only case, we still maintain
4334     // values in the vector mapping with initVector, as we do for other
4335     // instructions.
4336     for (unsigned Part = 0; Part < UF; ++Part) {
4337       // The pointer operand of the new GEP. If it's loop-invariant, we
4338       // won't broadcast it.
4339       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4340                                      : State.get(Operands.getOperand(0), Part);
4341 
4342       // Collect all the indices for the new GEP. If any index is
4343       // loop-invariant, we won't broadcast it.
4344       SmallVector<Value *, 4> Indices;
4345       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4346         VPValue *Operand = Operands.getOperand(I);
4347         if (IsIndexLoopInvariant[I - 1])
4348           Indices.push_back(State.get(Operand, {0, 0}));
4349         else
4350           Indices.push_back(State.get(Operand, Part));
4351       }
4352 
4353       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4354       // but it should be a vector, otherwise.
4355       auto *NewGEP =
4356           GEP->isInBounds()
4357               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4358                                           Indices)
4359               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4360       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4361              "NewGEP is not a pointer vector");
4362       State.set(VPDef, GEP, NewGEP, Part);
4363       addMetadata(NewGEP, GEP);
4364     }
4365   }
4366 }
4367 
4368 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4369                                               ElementCount VF) {
4370   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4371   PHINode *P = cast<PHINode>(PN);
4372   if (EnableVPlanNativePath) {
4373     // Currently we enter here in the VPlan-native path for non-induction
4374     // PHIs where all control flow is uniform. We simply widen these PHIs.
4375     // Create a vector phi with no operands - the vector phi operands will be
4376     // set at the end of vector code generation.
4377     Type *VecTy =
4378         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4379     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4380     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4381     OrigPHIsToFix.push_back(P);
4382 
4383     return;
4384   }
4385 
4386   assert(PN->getParent() == OrigLoop->getHeader() &&
4387          "Non-header phis should have been handled elsewhere");
4388 
4389   // In order to support recurrences we need to be able to vectorize Phi nodes.
4390   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4391   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4392   // this value when we vectorize all of the instructions that use the PHI.
4393   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4394     for (unsigned Part = 0; Part < UF; ++Part) {
4395       // This is phase one of vectorizing PHIs.
4396       bool ScalarPHI =
4397           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4398       Type *VecTy =
4399           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4400       Value *EntryPart = PHINode::Create(
4401           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4402       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4403     }
4404     return;
4405   }
4406 
4407   setDebugLocFromInst(Builder, P);
4408 
4409   // This PHINode must be an induction variable.
4410   // Make sure that we know about it.
4411   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4412 
4413   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4414   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4415 
4416   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4417   // which can be found from the original scalar operations.
4418   switch (II.getKind()) {
4419   case InductionDescriptor::IK_NoInduction:
4420     llvm_unreachable("Unknown induction");
4421   case InductionDescriptor::IK_IntInduction:
4422   case InductionDescriptor::IK_FpInduction:
4423     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4424   case InductionDescriptor::IK_PtrInduction: {
4425     // Handle the pointer induction variable case.
4426     assert(P->getType()->isPointerTy() && "Unexpected type.");
4427 
4428     if (Cost->isScalarAfterVectorization(P, VF)) {
4429       // This is the normalized GEP that starts counting at zero.
4430       Value *PtrInd =
4431           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4432       // Determine the number of scalars we need to generate for each unroll
4433       // iteration. If the instruction is uniform, we only need to generate the
4434       // first lane. Otherwise, we generate all VF values.
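      // For example, with VF = 4 and UF = 2 a non-uniform pointer induction
      // produces indices 0..3 for the lanes of part 0 and 4..7 for part 1,
      // each fed through emitTransformedIndex to form the scalar address.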
4435       unsigned Lanes =
4436           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4437       for (unsigned Part = 0; Part < UF; ++Part) {
4438         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4439           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4440                                            Lane + Part * VF.getKnownMinValue());
4441           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4442           Value *SclrGep =
4443               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4444           SclrGep->setName("next.gep");
4445           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4446         }
4447       }
4448       return;
4449     }
4450     assert(isa<SCEVConstant>(II.getStep()) &&
4451            "Induction step not a SCEV constant!");
4452     Type *PhiType = II.getStep()->getType();
4453 
4454     // Build a pointer phi
4455     Value *ScalarStartValue = II.getStartValue();
4456     Type *ScStValueType = ScalarStartValue->getType();
4457     PHINode *NewPointerPhi =
4458         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4459     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4460 
4461     // A pointer induction, performed by using a gep
4462     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4463     Instruction *InductionLoc = LoopLatch->getTerminator();
4464     const SCEV *ScalarStep = II.getStep();
4465     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4466     Value *ScalarStepValue =
4467         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4468     Value *InductionGEP = GetElementPtrInst::Create(
4469         ScStValueType->getPointerElementType(), NewPointerPhi,
4470         Builder.CreateMul(
4471             ScalarStepValue,
4472             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4473         "ptr.ind", InductionLoc);
4474     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4475 
4476     // Create UF many actual address geps that use the pointer
4477     // phi as base and a vectorized version of the step value
4478     // (<step*0, ..., step*N>) as offset.
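    // For example, with VF = 4, UF = 2 and a step that expands to 1, part 0
    // uses offsets <0, 1, 2, 3> and part 1 uses offsets <4, 5, 6, 7>, each
    // multiplied by the splatted step and applied to the pointer phi via a
    // gep.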
4479     for (unsigned Part = 0; Part < UF; ++Part) {
4480       SmallVector<Constant *, 8> Indices;
      // Create the lane offsets for this part: Part * VF + 0, ..., Part * VF + VF - 1.
4482       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4483         Indices.push_back(
4484             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4485       Constant *StartOffset = ConstantVector::get(Indices);
4486 
4487       Value *GEP = Builder.CreateGEP(
4488           ScStValueType->getPointerElementType(), NewPointerPhi,
4489           Builder.CreateMul(
4490               StartOffset,
4491               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4492               "vector.gep"));
4493       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4494     }
4495   }
4496   }
4497 }
4498 
4499 /// A helper function for checking whether an integer division-related
4500 /// instruction may divide by zero (in which case it must be predicated if
4501 /// executed conditionally in the scalar code).
4502 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4504 /// converted into multiplication, so we will still end up scalarizing
4505 /// the division, but can do so w/o predication.
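/// For example, `udiv i32 %x, %y` may divide by zero when %y is not a
/// compile-time constant, whereas `udiv i32 %x, 4` cannot.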
4506 static bool mayDivideByZero(Instruction &I) {
4507   assert((I.getOpcode() == Instruction::UDiv ||
4508           I.getOpcode() == Instruction::SDiv ||
4509           I.getOpcode() == Instruction::URem ||
4510           I.getOpcode() == Instruction::SRem) &&
4511          "Unexpected instruction");
4512   Value *Divisor = I.getOperand(1);
4513   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4514   return !CInt || CInt->isZero();
4515 }
4516 
4517 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4518                                            VPUser &User,
4519                                            VPTransformState &State) {
4520   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4521   switch (I.getOpcode()) {
4522   case Instruction::Call:
4523   case Instruction::Br:
4524   case Instruction::PHI:
4525   case Instruction::GetElementPtr:
4526   case Instruction::Select:
4527     llvm_unreachable("This instruction is handled by a different recipe.");
4528   case Instruction::UDiv:
4529   case Instruction::SDiv:
4530   case Instruction::SRem:
4531   case Instruction::URem:
4532   case Instruction::Add:
4533   case Instruction::FAdd:
4534   case Instruction::Sub:
4535   case Instruction::FSub:
4536   case Instruction::FNeg:
4537   case Instruction::Mul:
4538   case Instruction::FMul:
4539   case Instruction::FDiv:
4540   case Instruction::FRem:
4541   case Instruction::Shl:
4542   case Instruction::LShr:
4543   case Instruction::AShr:
4544   case Instruction::And:
4545   case Instruction::Or:
4546   case Instruction::Xor: {
4547     // Just widen unops and binops.
4548     setDebugLocFromInst(Builder, &I);
4549 
4550     for (unsigned Part = 0; Part < UF; ++Part) {
4551       SmallVector<Value *, 2> Ops;
4552       for (VPValue *VPOp : User.operands())
4553         Ops.push_back(State.get(VPOp, Part));
4554 
4555       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4556 
4557       if (auto *VecOp = dyn_cast<Instruction>(V))
4558         VecOp->copyIRFlags(&I);
4559 
4560       // Use this vector value for all users of the original instruction.
4561       State.set(Def, &I, V, Part);
4562       addMetadata(V, &I);
4563     }
4564 
4565     break;
4566   }
4567   case Instruction::ICmp:
4568   case Instruction::FCmp: {
4569     // Widen compares. Generate vector compares.
4570     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4571     auto *Cmp = cast<CmpInst>(&I);
4572     setDebugLocFromInst(Builder, Cmp);
4573     for (unsigned Part = 0; Part < UF; ++Part) {
4574       Value *A = State.get(User.getOperand(0), Part);
4575       Value *B = State.get(User.getOperand(1), Part);
4576       Value *C = nullptr;
4577       if (FCmp) {
4578         // Propagate fast math flags.
4579         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4580         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4581         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4582       } else {
4583         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4584       }
4585       State.set(Def, &I, C, Part);
4586       addMetadata(C, &I);
4587     }
4588 
4589     break;
4590   }
4591 
4592   case Instruction::ZExt:
4593   case Instruction::SExt:
4594   case Instruction::FPToUI:
4595   case Instruction::FPToSI:
4596   case Instruction::FPExt:
4597   case Instruction::PtrToInt:
4598   case Instruction::IntToPtr:
4599   case Instruction::SIToFP:
4600   case Instruction::UIToFP:
4601   case Instruction::Trunc:
4602   case Instruction::FPTrunc:
4603   case Instruction::BitCast: {
4604     auto *CI = cast<CastInst>(&I);
4605     setDebugLocFromInst(Builder, CI);
4606 
4607     /// Vectorize casts.
4608     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4609     Type *DestTy =
4610         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4611 
4612     for (unsigned Part = 0; Part < UF; ++Part) {
4613       Value *A = State.get(User.getOperand(0), Part);
4614       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4615       State.set(Def, &I, Cast, Part);
4616       addMetadata(Cast, &I);
4617     }
4618     break;
4619   }
4620   default:
4621     // This instruction is not vectorized by simple widening.
4622     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4623     llvm_unreachable("Unhandled instruction!");
4624   } // end of switch.
4625 }
4626 
4627 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4628                                                VPUser &ArgOperands,
4629                                                VPTransformState &State) {
4630   assert(!isa<DbgInfoIntrinsic>(I) &&
4631          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4632   setDebugLocFromInst(Builder, &I);
4633 
4634   Module *M = I.getParent()->getParent()->getParent();
4635   auto *CI = cast<CallInst>(&I);
4636 
4637   SmallVector<Type *, 4> Tys;
4638   for (Value *ArgOperand : CI->arg_operands())
4639     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4640 
4641   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4642 
  // This flag shows whether we should use the intrinsic or a plain call for
  // the vectorized version of the instruction, i.e. whether the intrinsic
  // call is more beneficial than the library call.
4646   bool NeedToScalarize = false;
4647   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4648   bool UseVectorIntrinsic =
4649       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4650   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4651          "Instruction should be scalarized elsewhere.");
4652 
4653   for (unsigned Part = 0; Part < UF; ++Part) {
4654     SmallVector<Value *, 4> Args;
4655     for (auto &I : enumerate(ArgOperands.operands())) {
4656       // Some intrinsics have a scalar argument - don't replace it with a
4657       // vector.
4658       Value *Arg;
4659       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4660         Arg = State.get(I.value(), Part);
4661       else
4662         Arg = State.get(I.value(), {0, 0});
4663       Args.push_back(Arg);
4664     }
4665 
4666     Function *VectorF;
4667     if (UseVectorIntrinsic) {
4668       // Use vector version of the intrinsic.
4669       Type *TysForDecl[] = {CI->getType()};
4670       if (VF.isVector()) {
4671         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4672         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4673       }
4674       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4675       assert(VectorF && "Can't retrieve vector intrinsic.");
4676     } else {
4677       // Use vector version of the function call.
4678       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4679 #ifndef NDEBUG
4680       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4681              "Can't create vector function.");
4682 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4694   }
4695 }
4696 
4697 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4698                                                  VPUser &Operands,
4699                                                  bool InvariantCond,
4700                                                  VPTransformState &State) {
4701   setDebugLocFromInst(Builder, &I);
4702 
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // InstCombine will make this a no-op.
4707   auto *InvarCond =
4708       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4709 
4710   for (unsigned Part = 0; Part < UF; ++Part) {
4711     Value *Cond =
4712         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4713     Value *Op0 = State.get(Operands.getOperand(1), Part);
4714     Value *Op1 = State.get(Operands.getOperand(2), Part);
4715     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4716     State.set(VPDef, &I, Sel, Part);
4717     addMetadata(Sel, &I);
4718   }
4719 }
4720 
4721 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4722   // We should not collect Scalars more than once per VF. Right now, this
4723   // function is called from collectUniformsAndScalars(), which already does
4724   // this check. Collecting Scalars for VF=1 does not make any sense.
4725   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4726          "This function should not be visited twice for the same VF");
4727 
4728   SmallSetVector<Instruction *, 8> Worklist;
4729 
4730   // These sets are used to seed the analysis with pointers used by memory
4731   // accesses that will remain scalar.
4732   SmallSetVector<Instruction *, 8> ScalarPtrs;
4733   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4734   auto *Latch = TheLoop->getLoopLatch();
4735 
4736   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4737   // The pointer operands of loads and stores will be scalar as long as the
4738   // memory access is not a gather or scatter operation. The value operand of a
4739   // store will remain scalar if the store is scalarized.
4740   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4741     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4742     assert(WideningDecision != CM_Unknown &&
4743            "Widening decision should be ready at this moment");
4744     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4745       if (Ptr == Store->getValueOperand())
4746         return WideningDecision == CM_Scalarize;
4747     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4749     return WideningDecision != CM_GatherScatter;
4750   };
4751 
4752   // A helper that returns true if the given value is a bitcast or
4753   // getelementptr instruction contained in the loop.
4754   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4755     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4756             isa<GetElementPtrInst>(V)) &&
4757            !TheLoop->isLoopInvariant(V);
4758   };
4759 
4760   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4761     if (!isa<PHINode>(Ptr) ||
4762         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4763       return false;
4764     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4765     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4766       return false;
4767     return isScalarUse(MemAccess, Ptr);
4768   };
4769 
4770   // A helper that evaluates a memory access's use of a pointer. If the
4771   // pointer is actually the pointer induction of a loop, it is being
4772   // inserted into Worklist. If the use will be a scalar use, and the
4773   // pointer is only used by memory accesses, we place the pointer in
4774   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4775   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4776     if (isScalarPtrInduction(MemAccess, Ptr)) {
4777       Worklist.insert(cast<Instruction>(Ptr));
4778       Instruction *Update = cast<Instruction>(
4779           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4780       Worklist.insert(Update);
4781       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4782                         << "\n");
4783       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4784                         << "\n");
4785       return;
4786     }
4787     // We only care about bitcast and getelementptr instructions contained in
4788     // the loop.
4789     if (!isLoopVaryingBitCastOrGEP(Ptr))
4790       return;
4791 
4792     // If the pointer has already been identified as scalar (e.g., if it was
4793     // also identified as uniform), there's nothing to do.
4794     auto *I = cast<Instruction>(Ptr);
4795     if (Worklist.count(I))
4796       return;
4797 
4798     // If the use of the pointer will be a scalar use, and all users of the
4799     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4800     // place the pointer in PossibleNonScalarPtrs.
4801     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4802           return isa<LoadInst>(U) || isa<StoreInst>(U);
4803         }))
4804       ScalarPtrs.insert(I);
4805     else
4806       PossibleNonScalarPtrs.insert(I);
4807   };
4808 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4813   //
4814   // (1) Add to the worklist all instructions that have been identified as
4815   // uniform-after-vectorization.
4816   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4817 
4818   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4819   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4821   // scatter operation. The value operand of a store will remain scalar if the
4822   // store is scalarized.
4823   for (auto *BB : TheLoop->blocks())
4824     for (auto &I : *BB) {
4825       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4826         evaluatePtrUse(Load, Load->getPointerOperand());
4827       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4828         evaluatePtrUse(Store, Store->getPointerOperand());
4829         evaluatePtrUse(Store, Store->getValueOperand());
4830       }
4831     }
4832   for (auto *I : ScalarPtrs)
4833     if (!PossibleNonScalarPtrs.count(I)) {
4834       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4835       Worklist.insert(I);
4836     }
4837 
4838   // Insert the forced scalars.
4839   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4840   // induction variable when the PHI user is scalarized.
4841   auto ForcedScalar = ForcedScalars.find(VF);
4842   if (ForcedScalar != ForcedScalars.end())
4843     for (auto *I : ForcedScalar->second)
4844       Worklist.insert(I);
4845 
4846   // Expand the worklist by looking through any bitcasts and getelementptr
4847   // instructions we've already identified as scalar. This is similar to the
4848   // expansion step in collectLoopUniforms(); however, here we're only
4849   // expanding to include additional bitcasts and getelementptr instructions.
4850   unsigned Idx = 0;
4851   while (Idx != Worklist.size()) {
4852     Instruction *Dst = Worklist[Idx++];
4853     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4854       continue;
4855     auto *Src = cast<Instruction>(Dst->getOperand(0));
4856     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4857           auto *J = cast<Instruction>(U);
4858           return !TheLoop->contains(J) || Worklist.count(J) ||
4859                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4860                   isScalarUse(J, Src));
4861         })) {
4862       Worklist.insert(Src);
4863       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4864     }
4865   }
4866 
4867   // An induction variable will remain scalar if all users of the induction
4868   // variable and induction variable update remain scalar.
4869   for (auto &Induction : Legal->getInductionVars()) {
4870     auto *Ind = Induction.first;
4871     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4872 
4873     // If tail-folding is applied, the primary induction variable will be used
4874     // to feed a vector compare.
4875     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4876       continue;
4877 
4878     // Determine if all users of the induction variable are scalar after
4879     // vectorization.
4880     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4881       auto *I = cast<Instruction>(U);
4882       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4883     });
4884     if (!ScalarInd)
4885       continue;
4886 
4887     // Determine if all users of the induction variable update instruction are
4888     // scalar after vectorization.
4889     auto ScalarIndUpdate =
4890         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4891           auto *I = cast<Instruction>(U);
4892           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4893         });
4894     if (!ScalarIndUpdate)
4895       continue;
4896 
4897     // The induction variable and its update instruction will remain scalar.
4898     Worklist.insert(Ind);
4899     Worklist.insert(IndUpdate);
4900     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4901     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4902                       << "\n");
4903   }
4904 
4905   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4906 }
4907 
4908 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4909                                                          ElementCount VF) {
4910   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4911   if (!blockNeedsPredication(I->getParent()))
4912     return false;
4913   switch(I->getOpcode()) {
4914   default:
4915     break;
4916   case Instruction::Load:
4917   case Instruction::Store: {
4918     if (!Legal->isMaskRequired(I))
4919       return false;
4920     auto *Ptr = getLoadStorePointerOperand(I);
4921     auto *Ty = getMemInstValueType(I);
4922     // We have already decided how to vectorize this instruction, get that
4923     // result.
4924     if (VF.isVector()) {
4925       InstWidening WideningDecision = getWideningDecision(I, VF);
4926       assert(WideningDecision != CM_Unknown &&
4927              "Widening decision should be ready at this moment");
4928       return WideningDecision == CM_Scalarize;
4929     }
4930     const Align Alignment = getLoadStoreAlignment(I);
4931     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4932                                 isLegalMaskedGather(Ty, Alignment))
4933                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4934                                 isLegalMaskedScatter(Ty, Alignment));
4935   }
4936   case Instruction::UDiv:
4937   case Instruction::SDiv:
4938   case Instruction::SRem:
4939   case Instruction::URem:
4940     return mayDivideByZero(*I);
4941   }
4942   return false;
4943 }
4944 
4945 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4946     Instruction *I, ElementCount VF) {
4947   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4948   assert(getWideningDecision(I, VF) == CM_Unknown &&
4949          "Decision should not be set yet.");
4950   auto *Group = getInterleavedAccessGroup(I);
4951   assert(Group && "Must have a group.");
4952 
  // If the instruction's allocated size doesn't equal its type size, it
4954   // requires padding and will be scalarized.
4955   auto &DL = I->getModule()->getDataLayout();
4956   auto *ScalarTy = getMemInstValueType(I);
4957   if (hasIrregularType(ScalarTy, DL, VF))
4958     return false;
4959 
4960   // Check if masking is required.
4961   // A Group may need masking for one of two reasons: it resides in a block that
4962   // needs predication, or it was decided to use masking to deal with gaps.
4963   bool PredicatedAccessRequiresMasking =
4964       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4965   bool AccessWithGapsRequiresMasking =
4966       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4967   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4968     return true;
4969 
4970   // If masked interleaving is required, we expect that the user/target had
4971   // enabled it, because otherwise it either wouldn't have been created or
4972   // it should have been invalidated by the CostModel.
4973   assert(useMaskedInterleavedAccesses(TTI) &&
4974          "Masked interleave-groups for predicated accesses are not enabled.");
4975 
4976   auto *Ty = getMemInstValueType(I);
4977   const Align Alignment = getLoadStoreAlignment(I);
4978   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4979                           : TTI.isLegalMaskedStore(Ty, Alignment);
4980 }
4981 
4982 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4983     Instruction *I, ElementCount VF) {
4984   // Get and ensure we have a valid memory instruction.
4985   LoadInst *LI = dyn_cast<LoadInst>(I);
4986   StoreInst *SI = dyn_cast<StoreInst>(I);
4987   assert((LI || SI) && "Invalid memory instruction");
4988 
4989   auto *Ptr = getLoadStorePointerOperand(I);
4990 
  // First of all, in order to be widened, the pointer must be consecutive.
4992   if (!Legal->isConsecutivePtr(Ptr))
4993     return false;
4994 
4995   // If the instruction is a store located in a predicated block, it will be
4996   // scalarized.
4997   if (isScalarWithPredication(I))
4998     return false;
4999 
  // If the instruction's allocated size doesn't equal its type size, it
5001   // requires padding and will be scalarized.
5002   auto &DL = I->getModule()->getDataLayout();
5003   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5004   if (hasIrregularType(ScalarTy, DL, VF))
5005     return false;
5006 
5007   return true;
5008 }
5009 
5010 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5011   // We should not collect Uniforms more than once per VF. Right now,
5012   // this function is called from collectUniformsAndScalars(), which
5013   // already does this check. Collecting Uniforms for VF=1 does not make any
5014   // sense.
5015 
5016   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5017          "This function should not be visited twice for the same VF");
5018 
  // Initialize the entry for this VF. Even if we find no uniform value, we
  // will not analyze this VF again: Uniforms.count(VF) will return 1.
5021   Uniforms[VF].clear();
5022 
5023   // We now know that the loop is vectorizable!
5024   // Collect instructions inside the loop that will remain uniform after
5025   // vectorization.
5026 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5029   auto isOutOfScope = [&](Value *V) -> bool {
5030     Instruction *I = dyn_cast<Instruction>(V);
5031     return (!I || !TheLoop->contains(I));
5032   };
5033 
5034   SetVector<Instruction *> Worklist;
5035   BasicBlock *Latch = TheLoop->getLoopLatch();
5036 
5037   // Instructions that are scalar with predication must not be considered
5038   // uniform after vectorization, because that would create an erroneous
5039   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
5041   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5042     if (isOutOfScope(I)) {
5043       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5044                         << *I << "\n");
5045       return;
5046     }
5047     if (isScalarWithPredication(I, VF)) {
5048       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5049                         << *I << "\n");
5050       return;
5051     }
5052     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5053     Worklist.insert(I);
5054   };
5055 
5056   // Start with the conditional branch. If the branch condition is an
5057   // instruction contained in the loop that is only used by the branch, it is
5058   // uniform.
5059   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5060   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5061     addToWorklistIfAllowed(Cmp);
5062 
5063   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5064   // are pointers that are treated like consecutive pointers during
5065   // vectorization. The pointer operands of interleaved accesses are an
5066   // example.
5067   SmallSetVector<Value *, 8> ConsecutiveLikePtrs;
5068 
5069   // Holds pointer operands of instructions that are possibly non-uniform.
5070   SmallPtrSet<Value *, 8> PossibleNonUniformPtrs;
5071 
5072   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5073     InstWidening WideningDecision = getWideningDecision(I, VF);
5074     assert(WideningDecision != CM_Unknown &&
5075            "Widening decision should be ready at this moment");
5076 
5077     // The address of a uniform mem op is itself uniform.  We exclude stores
5078     // here as there's an assumption in the current code that all uses of
5079     // uniform instructions are uniform and, as noted below, uniform stores are
5080     // still handled via replication (i.e. aren't uniform after vectorization).
5081     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5082       assert(WideningDecision == CM_Scalarize);
5083       return true;
5084     }
5085 
5086     return (WideningDecision == CM_Widen ||
5087             WideningDecision == CM_Widen_Reverse ||
5088             WideningDecision == CM_Interleave);
5089   };
5090 
5091 
5092   // Returns true if Ptr is the pointer operand of a memory access instruction
5093   // I, and I is known to not require scalarization.
5094   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5095     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5096   };
5097 
5098   // Iterate over the instructions in the loop, and collect all
5099   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5100   // that a consecutive-like pointer operand will be scalarized, we collect it
5101   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5102   // getelementptr instruction can be used by both vectorized and scalarized
5103   // memory instructions. For example, if a loop loads and stores from the same
5104   // location, but the store is conditional, the store will be scalarized, and
5105   // the getelementptr won't remain uniform.
5106   for (auto *BB : TheLoop->blocks())
5107     for (auto &I : *BB) {
5108       // If there's no pointer operand, there's nothing to do.
5109       auto *Ptr = getLoadStorePointerOperand(&I);
5110       if (!Ptr)
5111         continue;
5112 
5113       // For now, avoid walking use lists in other functions.
5114       // TODO: Rewrite this algorithm from uses up.
5115       if (!isa<Instruction>(Ptr) && !isa<Argument>(Ptr))
5116         continue;
5117 
5118       // A uniform memory op is itself uniform.  We exclude stores here as we
5119       // haven't yet added dedicated logic in the CLONE path and rely on
5120       // REPLICATE + DSE for correctness.
5121       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5122         addToWorklistIfAllowed(&I);
5123 
5124       // True if all users of Ptr are memory accesses that have Ptr as their
5125       // pointer operand.  Since loops are assumed to be in LCSSA form, this
5126       // disallows uses outside the loop as well.
5127       auto UsersAreMemAccesses =
5128           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5129             return getLoadStorePointerOperand(U) == Ptr;
5130           });
5131 
5132       // Ensure the memory instruction will not be scalarized or used by
5133       // gather/scatter, making its pointer operand non-uniform. If the pointer
5134       // operand is used by any instruction other than a memory access, we
5135       // conservatively assume the pointer operand may be non-uniform.
5136       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5137         PossibleNonUniformPtrs.insert(Ptr);
5138 
5139       // If the memory instruction will be vectorized and its pointer operand
5140       // is consecutive-like, or interleaving - the pointer operand should
5141       // remain uniform.
5142       else
5143         ConsecutiveLikePtrs.insert(Ptr);
5144     }
5145 
5146   // Add to the Worklist all consecutive and consecutive-like pointers that
5147   // aren't also identified as possibly non-uniform.
5148   for (auto *V : ConsecutiveLikePtrs)
5149     if (!PossibleNonUniformPtrs.count(V))
5150       if (auto *I = dyn_cast<Instruction>(V))
5151         addToWorklistIfAllowed(I);
5152 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
5156   unsigned idx = 0;
5157   while (idx != Worklist.size()) {
5158     Instruction *I = Worklist[idx++];
5159 
5160     for (auto OV : I->operand_values()) {
5161       // isOutOfScope operands cannot be uniform instructions.
5162       if (isOutOfScope(OV))
5163         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5166       auto *OP = dyn_cast<PHINode>(OV);
5167       if (OP && Legal->isFirstOrderRecurrence(OP))
5168         continue;
5169       // If all the users of the operand are uniform, then add the
5170       // operand into the uniform worklist.
5171       auto *OI = cast<Instruction>(OV);
5172       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5173             auto *J = cast<Instruction>(U);
5174             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5175           }))
5176         addToWorklistIfAllowed(OI);
5177     }
5178   }
5179 
5180   // For an instruction to be added into Worklist above, all its users inside
5181   // the loop should also be in Worklist. However, this condition cannot be
5182   // true for phi nodes that form a cyclic dependence. We must process phi
5183   // nodes separately. An induction variable will remain uniform if all users
5184   // of the induction variable and induction variable update remain uniform.
5185   // The code below handles both pointer and non-pointer induction variables.
5186   for (auto &Induction : Legal->getInductionVars()) {
5187     auto *Ind = Induction.first;
5188     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5189 
5190     // Determine if all users of the induction variable are uniform after
5191     // vectorization.
5192     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5193       auto *I = cast<Instruction>(U);
5194       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5195              isVectorizedMemAccessUse(I, Ind);
5196     });
5197     if (!UniformInd)
5198       continue;
5199 
5200     // Determine if all users of the induction variable update instruction are
5201     // uniform after vectorization.
5202     auto UniformIndUpdate =
5203         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5204           auto *I = cast<Instruction>(U);
5205           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5206                  isVectorizedMemAccessUse(I, IndUpdate);
5207         });
5208     if (!UniformIndUpdate)
5209       continue;
5210 
5211     // The induction variable and its update instruction will remain uniform.
5212     addToWorklistIfAllowed(Ind);
5213     addToWorklistIfAllowed(IndUpdate);
5214   }
5215 
5216   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5217 }
5218 
5219 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5220   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5221 
5222   if (Legal->getRuntimePointerChecking()->Need) {
5223     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5224         "runtime pointer checks needed. Enable vectorization of this "
5225         "loop with '#pragma clang loop vectorize(enable)' when "
5226         "compiling with -Os/-Oz",
5227         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5228     return true;
5229   }
5230 
5231   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5232     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5233         "runtime SCEV checks needed. Enable vectorization of this "
5234         "loop with '#pragma clang loop vectorize(enable)' when "
5235         "compiling with -Os/-Oz",
5236         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5237     return true;
5238   }
5239 
5240   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5241   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5242     reportVectorizationFailure("Runtime stride check for small trip count",
5243         "runtime stride == 1 checks needed. Enable vectorization of "
5244         "this loop without such check by compiling with -Os/-Oz",
5245         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5246     return true;
5247   }
5248 
5249   return false;
5250 }
5251 
5252 Optional<ElementCount>
5253 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5254   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the runtime check is
    // still likely to be dynamically uniform if the target can skip it.
5257     reportVectorizationFailure(
5258         "Not inserting runtime ptr check for divergent target",
5259         "runtime pointer checks needed. Not enabled for divergent target",
5260         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5261     return None;
5262   }
5263 
5264   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5265   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5266   if (TC == 1) {
5267     reportVectorizationFailure("Single iteration (non) loop",
5268         "loop trip count is one, irrelevant for vectorization",
5269         "SingleIterationLoop", ORE, TheLoop);
5270     return None;
5271   }
5272 
5273   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5274 
5275   switch (ScalarEpilogueStatus) {
5276   case CM_ScalarEpilogueAllowed:
5277     return MaxVF;
5278   case CM_ScalarEpilogueNotNeededUsePredicate:
5279     LLVM_DEBUG(
5280         dbgs() << "LV: vector predicate hint/switch found.\n"
5281                << "LV: Not allowing scalar epilogue, creating predicated "
5282                << "vector loop.\n");
5283     break;
5284   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5285     // fallthrough as a special case of OptForSize
5286   case CM_ScalarEpilogueNotAllowedOptSize:
5287     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5288       LLVM_DEBUG(
5289           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5290     else
5291       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5292                         << "count.\n");
5293 
5294     // Bail if runtime checks are required, which are not good when optimising
5295     // for size.
5296     if (runtimeChecksRequired())
5297       return None;
5298     break;
5299   }
5300 
  // Now try tail folding.
5302 
5303   // Invalidate interleave groups that require an epilogue if we can't mask
5304   // the interleave-group.
5305   if (!useMaskedInterleavedAccesses(TTI)) {
5306     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5307            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5310     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5311   }
5312 
5313   assert(!MaxVF.isScalable() &&
5314          "Scalable vectors do not yet support tail folding");
5315   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5316          "MaxVF must be a power of 2");
5317   unsigned MaxVFtimesIC =
5318       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
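  // E.g. (illustrative values): TC = 128, MaxVF = 8, UserIC = 2 gives
  // MaxVFtimesIC = 16; since 128 % 16 == 0, no tail remains and MaxVF is
  // accepted below.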
5319   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5320     // Accept MaxVF if we do not have a tail.
5321     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5322     return MaxVF;
5323   }
5324 
5325   // If we don't know the precise trip count, or if the trip count that we
5326   // found modulo the vectorization factor is not zero, try to fold the tail
5327   // by masking.
5328   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5329   if (Legal->prepareToFoldTailByMasking()) {
5330     FoldTailByMasking = true;
5331     return MaxVF;
5332   }
5333 
  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to vectorization with a scalar epilogue.
5336   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5337     if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5338       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5339       return None;
5340     }
5341     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5342                          "scalar epilogue instead.\n");
5343     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5344     return MaxVF;
5345   }
5346 
5347   if (TC == 0) {
5348     reportVectorizationFailure(
5349         "Unable to calculate the loop count due to complex control flow",
5350         "unable to calculate the loop count due to complex control flow",
5351         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5352     return None;
5353   }
5354 
5355   reportVectorizationFailure(
5356       "Cannot optimize for size and vectorize at the same time.",
5357       "cannot optimize for size and vectorize at the same time. "
5358       "Enable vectorization of this loop with '#pragma clang loop "
5359       "vectorize(enable)' when compiling with -Os/-Oz",
5360       "NoTailLoopWithOptForSize", ORE, TheLoop);
5361   return None;
5362 }
5363 
5364 ElementCount
5365 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5366                                                  ElementCount UserVF) {
5367   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
5368   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5369   unsigned SmallestType, WidestType;
5370   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5371   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5372 
5373   // Get the maximum safe dependence distance in bits computed by LAA.
5374   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5375   // the memory accesses that is most restrictive (involved in the smallest
5376   // dependence distance).
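  // E.g. (illustrative values): a maximum safe VF of 16 on i32 accesses
  // (4 bytes each) yields 16 * 4 * 8 = 512 bits.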
5377   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5378 
5379   if (UserVF.isNonZero()) {
5380     // If legally unsafe, clamp the user vectorization factor to a safe value.
5381     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5382     if (UserVF.getFixedValue() <= MaxSafeVF)
5383       return UserVF;
5384 
5385     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5386                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5387                       << ".\n");
5388     ORE->emit([&]() {
5389       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5390                                         TheLoop->getStartLoc(),
5391                                         TheLoop->getHeader())
5392              << "User-specified vectorization factor "
5393              << ore::NV("UserVectorizationFactor", UserVF)
5394              << " is unsafe, clamping to maximum safe vectorization factor "
5395              << ore::NV("VectorizationFactor", MaxSafeVF);
5396     });
5397     return ElementCount::getFixed(MaxSafeVF);
5398   }
5399 
5400   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5401 
5402   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5404   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
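  // E.g. (illustrative values): 256-bit registers and a 32-bit widest type
  // give MaxVectorSize = PowerOf2Floor(256 / 32) = 8.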
5405 
5406   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5407                     << " / " << WidestType << " bits.\n");
5408   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5409                     << WidestRegister << " bits.\n");
5410 
5411   assert(MaxVectorSize <= WidestRegister &&
5412          "Did not expect to pack so many elements"
5413          " into one vector!");
5414   if (MaxVectorSize == 0) {
5415     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5416     MaxVectorSize = 1;
5417     return ElementCount::getFixed(MaxVectorSize);
5418   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5419              isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to the constant trip count. There is no point in
    // choosing a higher viable VF as done in the loop below.
5422     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5423                       << ConstTripCount << "\n");
5424     MaxVectorSize = ConstTripCount;
5425     return ElementCount::getFixed(MaxVectorSize);
5426   }
5427 
5428   unsigned MaxVF = MaxVectorSize;
5429   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5430       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5431     // Collect all viable vectorization factors larger than the default MaxVF
5432     // (i.e. MaxVectorSize).
5433     SmallVector<ElementCount, 8> VFs;
5434     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5435     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5436       VFs.push_back(ElementCount::getFixed(VS));
5437 
5438     // For each VF calculate its register usage.
5439     auto RUs = calculateRegisterUsage(VFs);
5440 
5441     // Select the largest VF which doesn't require more registers than existing
5442     // ones.
5443     for (int i = RUs.size() - 1; i >= 0; --i) {
5444       bool Selected = true;
5445       for (auto& pair : RUs[i].MaxLocalUsers) {
5446         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5447         if (pair.second > TargetNumRegisters)
5448           Selected = false;
5449       }
5450       if (Selected) {
5451         MaxVF = VFs[i].getKnownMinValue();
5452         break;
5453       }
5454     }
5455     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5456       if (MaxVF < MinVF) {
5457         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5458                           << ") with target's minimum: " << MinVF << '\n');
5459         MaxVF = MinVF;
5460       }
5461     }
5462   }
5463   return ElementCount::getFixed(MaxVF);
5464 }
5465 
5466 VectorizationFactor
5467 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5468   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5469 
5470   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5471   const float ScalarCost = Cost;
5472   unsigned Width = 1;
5473   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5474 
5475   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5476   if (ForceVectorization && MaxVF.isVector()) {
5477     // Ignore scalar width, because the user explicitly wants vectorization.
5478     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5479     // evaluation.
5480     Cost = std::numeric_limits<float>::max();
5481   }
5482 
5483   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5487     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5488     float VectorCost = C.first / (float)i;
5489     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5490                       << " costs: " << (int)VectorCost << ".\n");
5491     if (!C.second && !ForceVectorization) {
5492       LLVM_DEBUG(
5493           dbgs() << "LV: Not considering vector loop of width " << i
5494                  << " because it will not generate any vector instructions.\n");
5495       continue;
5496     }
5497     if (VectorCost < Cost) {
5498       Cost = VectorCost;
5499       Width = i;
5500     }
5501   }
5502 
5503   if (!EnableCondStoresVectorization && NumPredStores) {
5504     reportVectorizationFailure("There are conditional stores.",
5505         "store that is conditionally executed prevents vectorization",
5506         "ConditionalStore", ORE, TheLoop);
5507     Width = 1;
5508     Cost = ScalarCost;
5509   }
5510 
5511   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5512              << "LV: Vectorization seems to be not beneficial, "
5513              << "but was forced by a user.\n");
5514   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5515   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5516                                 (unsigned)(Width * Cost)};
5517   return Factor;
5518 }
5519 
5520 std::pair<unsigned, unsigned>
5521 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5522   unsigned MinWidth = -1U;
5523   unsigned MaxWidth = 8;
5524   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
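  // For illustration: a loop that only loads i8 values and stores i32 values
  // ends up with {MinWidth, MaxWidth} = {8, 32}.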
5525 
5526   // For each block.
5527   for (BasicBlock *BB : TheLoop->blocks()) {
5528     // For each instruction in the loop.
5529     for (Instruction &I : BB->instructionsWithoutDebug()) {
5530       Type *T = I.getType();
5531 
5532       // Skip ignored values.
5533       if (ValuesToIgnore.count(&I))
5534         continue;
5535 
5536       // Only examine Loads, Stores and PHINodes.
5537       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5538         continue;
5539 
5540       // Examine PHI nodes that are reduction variables. Update the type to
5541       // account for the recurrence type.
5542       if (auto *PN = dyn_cast<PHINode>(&I)) {
5543         if (!Legal->isReductionVariable(PN))
5544           continue;
5545         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5546         T = RdxDesc.getRecurrenceType();
5547       }
5548 
5549       // Examine the stored values.
5550       if (auto *ST = dyn_cast<StoreInst>(&I))
5551         T = ST->getValueOperand()->getType();
5552 
5553       // Ignore loaded pointer types and stored pointer types that are not
5554       // vectorizable.
5555       //
5556       // FIXME: The check here attempts to predict whether a load or store will
5557       //        be vectorized. We only know this for certain after a VF has
5558       //        been selected. Here, we assume that if an access can be
5559       //        vectorized, it will be. We should also look at extending this
5560       //        optimization to non-pointer types.
5561       //
5562       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5563           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5564         continue;
5565 
5566       MinWidth = std::min(MinWidth,
5567                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5568       MaxWidth = std::max(MaxWidth,
5569                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5570     }
5571   }
5572 
5573   return {MinWidth, MaxWidth};
5574 }
5575 
5576 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5577                                                            unsigned LoopCost) {
5578   // -- The interleave heuristics --
5579   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5580   // There are many micro-architectural considerations that we can't predict
5581   // at this level. For example, frontend pressure (on decode or fetch) due to
5582   // code size, or the number and capabilities of the execution ports.
5583   //
5584   // We use the following heuristics to select the interleave count:
5585   // 1. If the code has reductions, then we interleave to break the cross
5586   // iteration dependency.
5587   // 2. If the loop is really small, then we interleave to reduce the loop
5588   // overhead.
5589   // 3. We don't interleave if we think that we will spill registers to memory
5590   // due to the increased register pressure.
5591 
5592   if (!isScalarEpilogueAllowed())
5593     return 1;
5594 
  // The maximum safe dependence distance was already used to limit the
  // vectorization factor; do not interleave in that case.
5596   if (Legal->getMaxSafeDepDistBytes() != -1U)
5597     return 1;
5598 
5599   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5600   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5606   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5607       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5608     return 1;
5609 
5610   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants, so assume that we have at least one
  // instruction that uses at least one register.
5613   for (auto& pair : R.MaxLocalUsers) {
5614     pair.second = std::max(pair.second, 1U);
5615   }
5616 
5617   // We calculate the interleave count using the following formula.
5618   // Subtract the number of loop invariants from the number of available
5619   // registers. These registers are used by all of the interleaved instances.
5620   // Next, divide the remaining registers by the number of registers that is
5621   // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case IC
  // is set to 1 above.
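  // Worked example (illustrative numbers): with 32 registers in a class, 2
  // loop-invariant values and a maximum of 6 values live at once, the estimate
  // is PowerOf2Floor((32 - 2) / 6) = 4; with the induction-variable heuristic
  // below it becomes PowerOf2Floor((32 - 2 - 1) / 5) = 4.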
5628   unsigned IC = UINT_MAX;
5629 
5630   for (auto& pair : R.MaxLocalUsers) {
5631     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5632     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5633                       << " registers of "
5634                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5635     if (VF.isScalar()) {
5636       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5637         TargetNumRegisters = ForceTargetNumScalarRegs;
5638     } else {
5639       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5640         TargetNumRegisters = ForceTargetNumVectorRegs;
5641     }
5642     unsigned MaxLocalUsers = pair.second;
5643     unsigned LoopInvariantRegs = 0;
5644     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5645       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5646 
5647     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5648     // Don't count the induction variable as interleaved.
5649     if (EnableIndVarRegisterHeur) {
5650       TmpIC =
5651           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5652                         std::max(1U, (MaxLocalUsers - 1)));
5653     }
5654 
5655     IC = std::min(IC, TmpIC);
5656   }
5657 
5658   // Clamp the interleave ranges to reasonable counts.
5659   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5660   unsigned MaxInterleaveCount =
5661       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5662 
5663   // Check if the user has overridden the max.
5664   if (VF.isScalar()) {
5665     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5666       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5667   } else {
5668     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5669       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5670   }
5671 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count so it does not exceed the trip count divided by VF,
  // provided the result is at least 1.
5675   if (BestKnownTC) {
5676     MaxInterleaveCount =
5677         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5678     // Make sure MaxInterleaveCount is greater than 0.
5679     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5680   }
5681 
5682   assert(MaxInterleaveCount > 0 &&
5683          "Maximum interleave count must be greater than 0");
5684 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5687   if (IC > MaxInterleaveCount)
5688     IC = MaxInterleaveCount;
5689   else
5690     // Make sure IC is greater than 0.
5691     IC = std::max(1u, IC);
5692 
5693   assert(IC > 0 && "Interleave count must be greater than 0.");
5694 
5695   // If we did not calculate the cost for VF (because the user selected the VF)
5696   // then we calculate the cost of VF here.
5697   if (LoopCost == 0)
5698     LoopCost = expectedCost(VF).first;
5699 
5700   assert(LoopCost && "Non-zero loop cost expected");
5701 
5702   // Interleave if we vectorized this loop and there is a reduction that could
5703   // benefit from interleaving.
5704   if (VF.isVector() && HasReductions) {
5705     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5706     return IC;
5707   }
5708 
5709   // Note that if we've already vectorized the loop we will have done the
5710   // runtime check and so interleaving won't require further checks.
5711   bool InterleavingRequiresRuntimePointerCheck =
5712       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5713 
5714   // We want to interleave small loops in order to reduce the loop overhead and
5715   // potentially expose ILP opportunities.
5716   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5717                     << "LV: IC is " << IC << '\n'
5718                     << "LV: VF is " << VF.getKnownMinValue() << '\n');
5719   const bool AggressivelyInterleaveReductions =
5720       TTI.enableAggressiveInterleaving(HasReductions);
5721   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5722     // We assume that the cost overhead is 1 and we use the cost model
5723     // to estimate the cost of the loop and interleave until the cost of the
5724     // loop overhead is about 5% of the cost of the loop.
5725     unsigned SmallIC =
5726         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5727 
5728     // Interleave until store/load ports (estimated by max interleave count) are
5729     // saturated.
5730     unsigned NumStores = Legal->getNumStores();
5731     unsigned NumLoads = Legal->getNumLoads();
5732     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5733     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
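    // E.g. (illustrative values): IC = 8 with 2 stores and 1 load gives
    // StoresIC = 4 and LoadsIC = 8; their maximum is compared against SmallIC
    // below.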
5734 
5735     // If we have a scalar reduction (vector reductions are already dealt with
5736     // by this point), we can increase the critical path length if the loop
5737     // we're interleaving is inside another loop. Limit, by default to 2, so the
5738     // critical path only gets increased by one reduction operation.
5739     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5740       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5741       SmallIC = std::min(SmallIC, F);
5742       StoresIC = std::min(StoresIC, F);
5743       LoadsIC = std::min(LoadsIC, F);
5744     }
5745 
5746     if (EnableLoadStoreRuntimeInterleave &&
5747         std::max(StoresIC, LoadsIC) > SmallIC) {
5748       LLVM_DEBUG(
5749           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5750       return std::max(StoresIC, LoadsIC);
5751     }
5752 
5753     // If there are scalar reductions and TTI has enabled aggressive
5754     // interleaving for reductions, we will interleave to expose ILP.
5755     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5756         AggressivelyInterleaveReductions) {
5757       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC to satisfy the rare situation when resources are too limited.
5760       return std::max(IC / 2, SmallIC);
5761     } else {
5762       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5763       return SmallIC;
5764     }
5765   }
5766 
5767   // Interleave if this is a large loop (small loops are already dealt with by
5768   // this point) that could benefit from interleaving.
5769   if (AggressivelyInterleaveReductions) {
5770     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5771     return IC;
5772   }
5773 
5774   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5775   return 1;
5776 }
5777 
5778 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5779 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5780   // This function calculates the register usage by measuring the highest number
5781   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
5784   // met before their users. We assume that each instruction that has in-loop
5785   // users starts an interval. We record every time that an in-loop value is
5786   // used, so we have a list of the first and last occurrences of each
5787   // instruction. Next, we transpose this data structure into a multi map that
5788   // holds the list of intervals that *end* at a specific location. This multi
5789   // map allows us to perform a linear search. We scan the instructions linearly
5790   // and record each time that a new interval starts, by placing it in a set.
5791   // If we find this value in the multi-map then we remove it from the set.
5792   // The max register usage is the maximum size of the set.
5793   // We also search for instructions that are defined outside the loop, but are
5794   // used inside the loop. We need this number separately from the max-interval
5795   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
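  // Small illustration (hypothetical indices): if %a is defined at #1 and last
  // used at #4, and %b is defined at #2 and last used at #3, then while
  // scanning #3 both intervals are still open, so the usage for their register
  // class at that point is 2.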
5797   LoopBlocksDFS DFS(TheLoop);
5798   DFS.perform(LI);
5799 
5800   RegisterUsage RU;
5801 
5802   // Each 'key' in the map opens a new interval. The values
5803   // of the map are the index of the 'last seen' usage of the
5804   // instruction that is the key.
5805   using IntervalMap = DenseMap<Instruction *, unsigned>;
5806 
5807   // Maps instruction to its index.
5808   SmallVector<Instruction *, 64> IdxToInstr;
5809   // Marks the end of each interval.
5810   IntervalMap EndPoint;
5811   // Saves the list of instruction indices that are used in the loop.
5812   SmallPtrSet<Instruction *, 8> Ends;
5813   // Saves the list of values that are used in the loop but are
5814   // defined outside the loop, such as arguments and constants.
5815   SmallPtrSet<Value *, 8> LoopInvariants;
5816 
5817   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5818     for (Instruction &I : BB->instructionsWithoutDebug()) {
5819       IdxToInstr.push_back(&I);
5820 
5821       // Save the end location of each USE.
5822       for (Value *U : I.operands()) {
5823         auto *Instr = dyn_cast<Instruction>(U);
5824 
5825         // Ignore non-instruction values such as arguments, constants, etc.
5826         if (!Instr)
5827           continue;
5828 
5829         // If this instruction is outside the loop then record it and continue.
5830         if (!TheLoop->contains(Instr)) {
5831           LoopInvariants.insert(Instr);
5832           continue;
5833         }
5834 
5835         // Overwrite previous end points.
5836         EndPoint[Instr] = IdxToInstr.size();
5837         Ends.insert(Instr);
5838       }
5839     }
5840   }
5841 
5842   // Saves the list of intervals that end with the index in 'key'.
5843   using InstrList = SmallVector<Instruction *, 2>;
5844   DenseMap<unsigned, InstrList> TransposeEnds;
5845 
5846   // Transpose the EndPoints to a list of values that end at each index.
5847   for (auto &Interval : EndPoint)
5848     TransposeEnds[Interval.second].push_back(Interval.first);
5849 
5850   SmallPtrSet<Instruction *, 8> OpenIntervals;
5851   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5852   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5853 
5854   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5855 
5856   // A lambda that gets the register usage for the given type and VF.
5857   const auto &TTICapture = TTI;
5858   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
5859     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5860       return 0U;
5861     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5862   };
5863 
5864   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5865     Instruction *I = IdxToInstr[i];
5866 
5867     // Remove all of the instructions that end at this location.
5868     InstrList &List = TransposeEnds[i];
5869     for (Instruction *ToRemove : List)
5870       OpenIntervals.erase(ToRemove);
5871 
5872     // Ignore instructions that are never used within the loop.
5873     if (!Ends.count(I))
5874       continue;
5875 
5876     // Skip ignored values.
5877     if (ValuesToIgnore.count(I))
5878       continue;
5879 
5880     // For each VF find the maximum usage of registers.
5881     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5882       // Count the number of live intervals.
5883       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5884 
5885       if (VFs[j].isScalar()) {
5886         for (auto Inst : OpenIntervals) {
5887           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5888           if (RegUsage.find(ClassID) == RegUsage.end())
5889             RegUsage[ClassID] = 1;
5890           else
5891             RegUsage[ClassID] += 1;
5892         }
5893       } else {
5894         collectUniformsAndScalars(VFs[j]);
5895         for (auto Inst : OpenIntervals) {
5896           // Skip ignored values for VF > 1.
5897           if (VecValuesToIgnore.count(Inst))
5898             continue;
5899           if (isScalarAfterVectorization(Inst, VFs[j])) {
5900             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5901             if (RegUsage.find(ClassID) == RegUsage.end())
5902               RegUsage[ClassID] = 1;
5903             else
5904               RegUsage[ClassID] += 1;
5905           } else {
5906             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5907             if (RegUsage.find(ClassID) == RegUsage.end())
5908               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5909             else
5910               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5911           }
5912         }
5913       }
5914 
5915       for (auto& pair : RegUsage) {
5916         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5917           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5918         else
5919           MaxUsages[j][pair.first] = pair.second;
5920       }
5921     }
5922 
5923     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5924                       << OpenIntervals.size() << '\n');
5925 
5926     // Add the current instruction to the list of open intervals.
5927     OpenIntervals.insert(I);
5928   }
5929 
5930   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5931     SmallMapVector<unsigned, unsigned, 4> Invariant;
5932 
5933     for (auto Inst : LoopInvariants) {
5934       unsigned Usage =
5935           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5936       unsigned ClassID =
5937           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5938       if (Invariant.find(ClassID) == Invariant.end())
5939         Invariant[ClassID] = Usage;
5940       else
5941         Invariant[ClassID] += Usage;
5942     }
5943 
5944     LLVM_DEBUG({
5945       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5946       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5947              << " item\n";
5948       for (const auto &pair : MaxUsages[i]) {
5949         dbgs() << "LV(REG): RegisterClass: "
5950                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5951                << " registers\n";
5952       }
5953       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5954              << " item\n";
5955       for (const auto &pair : Invariant) {
5956         dbgs() << "LV(REG): RegisterClass: "
5957                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5958                << " registers\n";
5959       }
5960     });
5961 
5962     RU.LoopInvariantRegs = Invariant;
5963     RU.MaxLocalUsers = MaxUsages[i];
5964     RUs[i] = RU;
5965   }
5966 
5967   return RUs;
5968 }
5969 
5970 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5971   // TODO: Cost model for emulated masked load/store is completely
5972   // broken. This hack guides the cost model to use an artificially
5973   // high enough value to practically disable vectorization with such
5974   // operations, except where previously deployed legality hack allowed
5975   // using very low cost values. This is to avoid regressions coming simply
5976   // from moving "masked load/store" check from legality to cost model.
5977   // Masked Load/Gather emulation was previously never allowed.
  // Only a limited amount of masked store/scatter emulation was allowed.
5979   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5980   return isa<LoadInst>(I) ||
5981          (isa<StoreInst>(I) &&
5982           NumPredStores > NumberOfStoresToPredicate);
5983 }
5984 
5985 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5986   // If we aren't vectorizing the loop, or if we've already collected the
5987   // instructions to scalarize, there's nothing to do. Collection may already
5988   // have occurred if we have a user-selected VF and are now computing the
5989   // expected cost for interleaving.
5990   if (VF.isScalar() || VF.isZero() ||
5991       InstsToScalarize.find(VF) != InstsToScalarize.end())
5992     return;
5993 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5995   // not profitable to scalarize any instructions, the presence of VF in the
5996   // map will indicate that we've analyzed it already.
5997   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5998 
5999   // Find all the instructions that are scalar with predication in the loop and
6000   // determine if it would be better to not if-convert the blocks they are in.
6001   // If so, we also record the instructions to scalarize.
6002   for (BasicBlock *BB : TheLoop->blocks()) {
6003     if (!blockNeedsPredication(BB))
6004       continue;
6005     for (Instruction &I : *BB)
6006       if (isScalarWithPredication(&I)) {
6007         ScalarCostsTy ScalarCosts;
6008         // Do not apply discount logic if hacked cost is needed
6009         // for emulated masked memrefs.
6010         if (!useEmulatedMaskMemRefHack(&I) &&
6011             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6012           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6013         // Remember that BB will remain after vectorization.
6014         PredicatedBBsAfterVectorization.insert(BB);
6015       }
6016   }
6017 }
6018 
6019 int LoopVectorizationCostModel::computePredInstDiscount(
6020     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6021     ElementCount VF) {
6022   assert(!isUniformAfterVectorization(PredInst, VF) &&
6023          "Instruction marked uniform-after-vectorization will be predicated");
6024 
6025   // Initialize the discount to zero, meaning that the scalar version and the
6026   // vector version cost the same.
6027   int Discount = 0;
6028 
6029   // Holds instructions to analyze. The instructions we visit are mapped in
6030   // ScalarCosts. Those instructions are the ones that would be scalarized if
6031   // we find that the scalar version costs less.
6032   SmallVector<Instruction *, 8> Worklist;
6033 
6034   // Returns true if the given instruction can be scalarized.
6035   auto canBeScalarized = [&](Instruction *I) -> bool {
6036     // We only attempt to scalarize instructions forming a single-use chain
6037     // from the original predicated block that would otherwise be vectorized.
6038     // Although not strictly necessary, we give up on instructions we know will
6039     // already be scalar to avoid traversing chains that are unlikely to be
6040     // beneficial.
6041     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6042         isScalarAfterVectorization(I, VF))
6043       return false;
6044 
6045     // If the instruction is scalar with predication, it will be analyzed
6046     // separately. We ignore it within the context of PredInst.
6047     if (isScalarWithPredication(I))
6048       return false;
6049 
6050     // If any of the instruction's operands are uniform after vectorization,
6051     // the instruction cannot be scalarized. This prevents, for example, a
6052     // masked load from being scalarized.
6053     //
6054     // We assume we will only emit a value for lane zero of an instruction
6055     // marked uniform after vectorization, rather than VF identical values.
6056     // Thus, if we scalarize an instruction that uses a uniform, we would
6057     // create uses of values corresponding to the lanes we aren't emitting code
6058     // for. This behavior can be changed by allowing getScalarValue to clone
6059     // the lane zero values for uniforms rather than asserting.
6060     for (Use &U : I->operands())
6061       if (auto *J = dyn_cast<Instruction>(U.get()))
6062         if (isUniformAfterVectorization(J, VF))
6063           return false;
6064 
6065     // Otherwise, we can scalarize the instruction.
6066     return true;
6067   };
6068 
6069   // Compute the expected cost discount from scalarizing the entire expression
6070   // feeding the predicated instruction. We currently only consider expressions
6071   // that are single-use instruction chains.
6072   Worklist.push_back(PredInst);
6073   while (!Worklist.empty()) {
6074     Instruction *I = Worklist.pop_back_val();
6075 
6076     // If we've already analyzed the instruction, there's nothing to do.
6077     if (ScalarCosts.find(I) != ScalarCosts.end())
6078       continue;
6079 
6080     // Compute the cost of the vector instruction. Note that this cost already
6081     // includes the scalarization overhead of the predicated instruction.
6082     unsigned VectorCost = getInstructionCost(I, VF).first;
6083 
6084     // Compute the cost of the scalarized instruction. This cost is the cost of
6085     // the instruction as if it wasn't if-converted and instead remained in the
6086     // predicated block. We will scale this cost by block probability after
6087     // computing the scalarization overhead.
6088     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6089     unsigned ScalarCost =
6090         VF.getKnownMinValue() *
6091         getInstructionCost(I, ElementCount::getFixed(1)).first;
6092 
6093     // Compute the scalarization overhead of needed insertelement instructions
6094     // and phi nodes.
6095     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6096       ScalarCost += TTI.getScalarizationOverhead(
6097           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6098           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6099       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6100       ScalarCost +=
6101           VF.getKnownMinValue() *
6102           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6103     }
6104 
6105     // Compute the scalarization overhead of needed extractelement
6106     // instructions. For each of the instruction's operands, if the operand can
6107     // be scalarized, add it to the worklist; otherwise, account for the
6108     // overhead.
6109     for (Use &U : I->operands())
6110       if (auto *J = dyn_cast<Instruction>(U.get())) {
6111         assert(VectorType::isValidElementType(J->getType()) &&
6112                "Instruction has non-scalar type");
6113         if (canBeScalarized(J))
6114           Worklist.push_back(J);
6115         else if (needsExtract(J, VF)) {
6116           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6117           ScalarCost += TTI.getScalarizationOverhead(
6118               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6119               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6120         }
6121       }
6122 
6123     // Scale the total scalar cost by block probability.
6124     ScalarCost /= getReciprocalPredBlockProb();
6125 
    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs at least as much as the scalar version, so
    // scalarizing is considered worthwhile.
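    // For example (illustrative numbers only): a vector cost of 10 and a
    // probability-scaled scalar cost of 6 add a discount of 4 for this
    // instruction, making scalarization look profitable.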
6128     Discount += VectorCost - ScalarCost;
6129     ScalarCosts[I] = ScalarCost;
6130   }
6131 
6132   return Discount;
6133 }
6134 
6135 LoopVectorizationCostModel::VectorizationCostTy
6136 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6137   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6138   VectorizationCostTy Cost;
6139 
6140   // For each block.
6141   for (BasicBlock *BB : TheLoop->blocks()) {
6142     VectorizationCostTy BlockCost;
6143 
6144     // For each instruction in the old loop.
6145     for (Instruction &I : BB->instructionsWithoutDebug()) {
6146       // Skip ignored values.
6147       if (ValuesToIgnore.count(&I) ||
6148           (VF.isVector() && VecValuesToIgnore.count(&I)))
6149         continue;
6150 
6151       VectorizationCostTy C = getInstructionCost(&I, VF);
6152 
6153       // Check if we should override the cost.
6154       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6155         C.first = ForceTargetInstructionCost;
6156 
6157       BlockCost.first += C.first;
6158       BlockCost.second |= C.second;
6159       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6160                         << " for VF " << VF << " For instruction: " << I
6161                         << '\n');
6162     }
6163 
6164     // If we are vectorizing a predicated block, it will have been
6165     // if-converted. This means that the block's instructions (aside from
6166     // stores and instructions that may divide by zero) will now be
6167     // unconditionally executed. For the scalar case, we may not always execute
6168     // the predicated block. Thus, scale the block's cost by the probability of
6169     // executing it.
6170     if (VF.isScalar() && blockNeedsPredication(BB))
6171       BlockCost.first /= getReciprocalPredBlockProb();
6172 
6173     Cost.first += BlockCost.first;
6174     Cost.second |= BlockCost.second;
6175   }
6176 
6177   return Cost;
6178 }
6179 
6180 /// Gets Address Access SCEV after verifying that the access pattern
6181 /// is loop invariant except the induction variable dependence.
6182 ///
6183 /// This SCEV can be sent to the Target in order to estimate the address
6184 /// calculation cost.
6185 static const SCEV *getAddressAccessSCEV(
6186               Value *Ptr,
6187               LoopVectorizationLegality *Legal,
6188               PredicatedScalarEvolution &PSE,
6189               const Loop *TheLoop) {
6190 
6191   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6192   if (!Gep)
6193     return nullptr;
6194 
  // We are looking for a GEP whose indices are all loop invariant except for
  // one, which should be an induction variable.
6197   auto SE = PSE.getSE();
6198   unsigned NumOperands = Gep->getNumOperands();
6199   for (unsigned i = 1; i < NumOperands; ++i) {
6200     Value *Opd = Gep->getOperand(i);
6201     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6202         !Legal->isInductionVariable(Opd))
6203       return nullptr;
6204   }
6205 
  // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the
  // pointer's SCEV.
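  // Illustrative example (a sketch, not taken from a test): a GEP such as
  //   %a = getelementptr [64 x i32], [64 x i32]* %p, i64 %inv, i64 %iv
  // where %inv is loop invariant and %iv is an induction variable passes the
  // check above, so its SCEV is returned for address-cost estimation.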
6207   return PSE.getSCEV(Ptr);
6208 }
6209 
6210 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6211   return Legal->hasStride(I->getOperand(0)) ||
6212          Legal->hasStride(I->getOperand(1));
6213 }
6214 
6215 unsigned
6216 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6217                                                         ElementCount VF) {
6218   assert(VF.isVector() &&
6219          "Scalarization cost of instruction implies vectorization.");
6220   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6221   Type *ValTy = getMemInstValueType(I);
6222   auto SE = PSE.getSE();
6223 
6224   unsigned AS = getLoadStoreAddressSpace(I);
6225   Value *Ptr = getLoadStorePointerOperand(I);
6226   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6227 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6230   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6231 
6232   // Get the cost of the scalar memory instruction and address computation.
6233   unsigned Cost =
6234       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6235 
  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where its user is a vectorized instruction.
6238   const Align Alignment = getLoadStoreAlignment(I);
6239   Cost += VF.getKnownMinValue() *
6240           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6241                               AS, TTI::TCK_RecipThroughput);
6242 
6243   // Get the overhead of the extractelement and insertelement instructions
6244   // we might create due to scalarization.
6245   Cost += getScalarizationOverhead(I, VF);
6246 
6247   // If we have a predicated store, it may not be executed for each vector
6248   // lane. Scale the cost by the probability of executing the predicated
6249   // block.
6250   if (isPredicatedInst(I)) {
6251     Cost /= getReciprocalPredBlockProb();
6252 
6253     if (useEmulatedMaskMemRefHack(I))
      // Artificially set the cost to a value high enough to practically
      // disable vectorization with such operations.
6256       Cost = 3000000;
6257   }
6258 
6259   return Cost;
6260 }
6261 
6262 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6263                                                              ElementCount VF) {
6264   Type *ValTy = getMemInstValueType(I);
6265   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6266   Value *Ptr = getLoadStorePointerOperand(I);
6267   unsigned AS = getLoadStoreAddressSpace(I);
6268   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6269   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6270 
6271   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6272          "Stride should be 1 or -1 for consecutive memory access");
6273   const Align Alignment = getLoadStoreAlignment(I);
6274   unsigned Cost = 0;
6275   if (Legal->isMaskRequired(I))
6276     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6277                                       CostKind);
6278   else
6279     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6280                                 CostKind, I);
6281 
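  // A consecutive access with stride -1 walks memory backwards: the widened
  // load/store is expected to be followed by a reverse shuffle, whose cost is
  // added below.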
6282   bool Reverse = ConsecutiveStride < 0;
6283   if (Reverse)
6284     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6285   return Cost;
6286 }
6287 
6288 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6289                                                          ElementCount VF) {
6290   assert(Legal->isUniformMemOp(*I));
6291 
6292   Type *ValTy = getMemInstValueType(I);
6293   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6294   const Align Alignment = getLoadStoreAlignment(I);
6295   unsigned AS = getLoadStoreAddressSpace(I);
6296   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
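  // A uniform load is expected to be emitted as a single scalar load plus a
  // broadcast of the loaded value to all vector lanes, hence the three cost
  // components below (address computation, scalar memory op, broadcast).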
6297   if (isa<LoadInst>(I)) {
6298     return TTI.getAddressComputationCost(ValTy) +
6299            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6300                                CostKind) +
6301            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6302   }
6303   StoreInst *SI = cast<StoreInst>(I);
6304 
6305   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6306   return TTI.getAddressComputationCost(ValTy) +
6307          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6308                              CostKind) +
6309          (isLoopInvariantStoreValue
6310               ? 0
6311               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6312                                        VF.getKnownMinValue() - 1));
6313 }
6314 
6315 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6316                                                           ElementCount VF) {
6317   Type *ValTy = getMemInstValueType(I);
6318   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6319   const Align Alignment = getLoadStoreAlignment(I);
6320   const Value *Ptr = getLoadStorePointerOperand(I);
6321 
6322   return TTI.getAddressComputationCost(VectorTy) +
6323          TTI.getGatherScatterOpCost(
6324              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6325              TargetTransformInfo::TCK_RecipThroughput, I);
6326 }
6327 
6328 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6329                                                             ElementCount VF) {
6330   Type *ValTy = getMemInstValueType(I);
6331   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6332   unsigned AS = getLoadStoreAddressSpace(I);
6333 
6334   auto Group = getInterleavedAccessGroup(I);
6335   assert(Group && "Fail to get an interleaved access group.");
6336 
6337   unsigned InterleaveFactor = Group->getFactor();
6338   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6339   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6340 
6341   // Holds the indices of existing members in an interleaved load group.
6342   // An interleaved store group doesn't need this as it doesn't allow gaps.
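  // For example (illustrative): in a factor-3 load group where only members 0
  // and 2 are present, Indices ends up as {0, 2}; the missing member at
  // index 1 is a gap.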
6343   SmallVector<unsigned, 4> Indices;
6344   if (isa<LoadInst>(I)) {
6345     for (unsigned i = 0; i < InterleaveFactor; i++)
6346       if (Group->getMember(i))
6347         Indices.push_back(i);
6348   }
6349 
6350   // Calculate the cost of the whole interleaved group.
6351   bool UseMaskForGaps =
6352       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6353   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6354       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6355       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6356 
6357   if (Group->isReverse()) {
6358     // TODO: Add support for reversed masked interleaved access.
6359     assert(!Legal->isMaskRequired(I) &&
6360            "Reverse masked interleaved access not supported.");
6361     Cost += Group->getNumMembers() *
6362             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6363   }
6364   return Cost;
6365 }
6366 
6367 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6368                                                               ElementCount VF) {
6369   // Calculate scalar cost only. Vectorization cost should be ready at this
6370   // moment.
6371   if (VF.isScalar()) {
6372     Type *ValTy = getMemInstValueType(I);
6373     const Align Alignment = getLoadStoreAlignment(I);
6374     unsigned AS = getLoadStoreAddressSpace(I);
6375 
6376     return TTI.getAddressComputationCost(ValTy) +
6377            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6378                                TTI::TCK_RecipThroughput, I);
6379   }
6380   return getWideningCost(I, VF);
6381 }
6382 
6383 LoopVectorizationCostModel::VectorizationCostTy
6384 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6385                                                ElementCount VF) {
6386   assert(!VF.isScalable() &&
6387          "the cost model is not yet implemented for scalable vectorization");
6388   // If we know that this instruction will remain uniform, check the cost of
6389   // the scalar version.
6390   if (isUniformAfterVectorization(I, VF))
6391     VF = ElementCount::getFixed(1);
6392 
6393   if (VF.isVector() && isProfitableToScalarize(I, VF))
6394     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6395 
6396   // Forced scalars do not have any scalarization overhead.
6397   auto ForcedScalar = ForcedScalars.find(VF);
6398   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6399     auto InstSet = ForcedScalar->second;
6400     if (InstSet.count(I))
6401       return VectorizationCostTy(
6402           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6403            VF.getKnownMinValue()),
6404           false);
6405   }
6406 
6407   Type *VectorTy;
6408   unsigned C = getInstructionCost(I, VF, VectorTy);
6409 
6410   bool TypeNotScalarized =
6411       VF.isVector() && VectorTy->isVectorTy() &&
6412       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6413   return VectorizationCostTy(C, TypeNotScalarized);
6414 }
6415 
6416 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6417                                                               ElementCount VF) {
6418 
6419   assert(!VF.isScalable() &&
6420          "cannot compute scalarization overhead for scalable vectorization");
6421   if (VF.isScalar())
6422     return 0;
6423 
6424   unsigned Cost = 0;
6425   Type *RetTy = ToVectorTy(I->getType(), VF);
6426   if (!RetTy->isVoidTy() &&
6427       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6428     Cost += TTI.getScalarizationOverhead(
6429         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6430         true, false);
6431 
6432   // Some targets keep addresses scalar.
6433   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6434     return Cost;
6435 
6436   // Some targets support efficient element stores.
6437   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6438     return Cost;
6439 
6440   // Collect operands to consider.
6441   CallInst *CI = dyn_cast<CallInst>(I);
6442   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6443 
6444   // Skip operands that do not require extraction/scalarization and do not incur
6445   // any overhead.
6446   return Cost + TTI.getOperandsScalarizationOverhead(
6447                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6448 }
6449 
6450 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6451   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6452   if (VF.isScalar())
6453     return;
6454   NumPredStores = 0;
6455   for (BasicBlock *BB : TheLoop->blocks()) {
6456     // For each instruction in the old loop.
6457     for (Instruction &I : *BB) {
6458       Value *Ptr =  getLoadStorePointerOperand(&I);
6459       if (!Ptr)
6460         continue;
6461 
6462       // TODO: We should generate better code and update the cost model for
6463       // predicated uniform stores. Today they are treated as any other
6464       // predicated store (see added test cases in
6465       // invariant-store-vectorization.ll).
6466       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6467         NumPredStores++;
6468 
6469       if (Legal->isUniformMemOp(I)) {
6470         // TODO: Avoid replicating loads and stores instead of
6471         // relying on instcombine to remove them.
6472         // Load: Scalar load + broadcast
6473         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6474         unsigned Cost = getUniformMemOpCost(&I, VF);
6475         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6476         continue;
6477       }
6478 
6479       // We assume that widening is the best solution when possible.
6480       if (memoryInstructionCanBeWidened(&I, VF)) {
6481         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6482         int ConsecutiveStride =
6483                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6484         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6485                "Expected consecutive stride.");
6486         InstWidening Decision =
6487             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6488         setWideningDecision(&I, VF, Decision, Cost);
6489         continue;
6490       }
6491 
6492       // Choose between Interleaving, Gather/Scatter or Scalarization.
6493       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6494       unsigned NumAccesses = 1;
6495       if (isAccessInterleaved(&I)) {
6496         auto Group = getInterleavedAccessGroup(&I);
6497         assert(Group && "Fail to get an interleaved access group.");
6498 
6499         // Make one decision for the whole group.
6500         if (getWideningDecision(&I, VF) != CM_Unknown)
6501           continue;
6502 
6503         NumAccesses = Group->getNumMembers();
6504         if (interleavedAccessCanBeWidened(&I, VF))
6505           InterleaveCost = getInterleaveGroupCost(&I, VF);
6506       }
6507 
6508       unsigned GatherScatterCost =
6509           isLegalGatherOrScatter(&I)
6510               ? getGatherScatterCost(&I, VF) * NumAccesses
6511               : std::numeric_limits<unsigned>::max();
6512 
6513       unsigned ScalarizationCost =
6514           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6515 
6516       // Choose better solution for the current VF,
6517       // write down this decision and use it during vectorization.
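      // For example (illustrative costs): InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20 select CM_Interleave
      // with cost 8. Interleaving wins ties against gather/scatter but must be
      // strictly cheaper than scalarization.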
6518       unsigned Cost;
6519       InstWidening Decision;
6520       if (InterleaveCost <= GatherScatterCost &&
6521           InterleaveCost < ScalarizationCost) {
6522         Decision = CM_Interleave;
6523         Cost = InterleaveCost;
6524       } else if (GatherScatterCost < ScalarizationCost) {
6525         Decision = CM_GatherScatter;
6526         Cost = GatherScatterCost;
6527       } else {
6528         Decision = CM_Scalarize;
6529         Cost = ScalarizationCost;
6530       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group also receives the cost,
      // but the cost will actually be assigned to one member instruction.
6534       if (auto Group = getInterleavedAccessGroup(&I))
6535         setWideningDecision(Group, VF, Decision, Cost);
6536       else
6537         setWideningDecision(&I, VF, Decision, Cost);
6538     }
6539   }
6540 
6541   // Make sure that any load of address and any other address computation
6542   // remains scalar unless there is gather/scatter support. This avoids
6543   // inevitable extracts into address registers, and also has the benefit of
6544   // activating LSR more, since that pass can't optimize vectorized
6545   // addresses.
6546   if (TTI.prefersVectorizedAddressing())
6547     return;
6548 
6549   // Start with all scalar pointer uses.
6550   SmallPtrSet<Instruction *, 8> AddrDefs;
6551   for (BasicBlock *BB : TheLoop->blocks())
6552     for (Instruction &I : *BB) {
6553       Instruction *PtrDef =
6554         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6555       if (PtrDef && TheLoop->contains(PtrDef) &&
6556           getWideningDecision(&I, VF) != CM_GatherScatter)
6557         AddrDefs.insert(PtrDef);
6558     }
6559 
6560   // Add all instructions used to generate the addresses.
6561   SmallVector<Instruction *, 4> Worklist;
6562   for (auto *I : AddrDefs)
6563     Worklist.push_back(I);
6564   while (!Worklist.empty()) {
6565     Instruction *I = Worklist.pop_back_val();
6566     for (auto &Op : I->operands())
6567       if (auto *InstOp = dyn_cast<Instruction>(Op))
6568         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6569             AddrDefs.insert(InstOp).second)
6570           Worklist.push_back(InstOp);
6571   }
6572 
6573   for (auto *I : AddrDefs) {
6574     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, the decision
      // is instead changed here once we know this is the case.
6579       InstWidening Decision = getWideningDecision(I, VF);
6580       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6581         // Scalarize a widened load of address.
6582         setWideningDecision(
6583             I, VF, CM_Scalarize,
6584             (VF.getKnownMinValue() *
6585              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6586       else if (auto Group = getInterleavedAccessGroup(I)) {
6587         // Scalarize an interleave group of address loads.
6588         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6589           if (Instruction *Member = Group->getMember(I))
6590             setWideningDecision(
6591                 Member, VF, CM_Scalarize,
6592                 (VF.getKnownMinValue() *
6593                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6594         }
6595       }
6596     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6599       ForcedScalars[VF].insert(I);
6600   }
6601 }
6602 
6603 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6604                                                         ElementCount VF,
6605                                                         Type *&VectorTy) {
6606   Type *RetTy = I->getType();
6607   if (canTruncateToMinimalBitwidth(I, VF))
6608     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6609   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6610   auto SE = PSE.getSE();
6611   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6612 
6613   // TODO: We need to estimate the cost of intrinsic calls.
6614   switch (I->getOpcode()) {
6615   case Instruction::GetElementPtr:
6616     // We mark this instruction as zero-cost because the cost of GEPs in
6617     // vectorized code depends on whether the corresponding memory instruction
6618     // is scalarized or not. Therefore, we handle GEPs with the memory
6619     // instruction cost.
6620     return 0;
6621   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6625     bool ScalarPredicatedBB = false;
6626     BranchInst *BI = cast<BranchInst>(I);
6627     if (VF.isVector() && BI->isConditional() &&
6628         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6629          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6630       ScalarPredicatedBB = true;
6631 
6632     if (ScalarPredicatedBB) {
6633       // Return cost for branches around scalarized and predicated blocks.
6634       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6635       auto *Vec_i1Ty =
6636           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6637       return (TTI.getScalarizationOverhead(
6638                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6639                   false, true) +
6640               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6641                VF.getKnownMinValue()));
6642     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6643       // The back-edge branch will remain, as will all scalar branches.
6644       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6645     else
6646       // This branch will be eliminated by if-conversion.
6647       return 0;
6648     // Note: We currently assume zero cost for an unconditional branch inside
6649     // a predicated block since it will become a fall-through, although we
6650     // may decide in the future to call TTI for all branches.
6651   }
6652   case Instruction::PHI: {
6653     auto *Phi = cast<PHINode>(I);
6654 
6655     // First-order recurrences are replaced by vector shuffles inside the loop.
6656     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6657     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6658       return TTI.getShuffleCost(
6659           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6660           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6661 
6662     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6663     // converted into select instructions. We require N - 1 selects per phi
6664     // node, where N is the number of incoming values.
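    // For example, a phi with three incoming values is expected to be lowered
    // to two vector selects.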
6665     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6666       return (Phi->getNumIncomingValues() - 1) *
6667              TTI.getCmpSelInstrCost(
6668                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6669                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6670                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6671 
6672     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6673   }
6674   case Instruction::UDiv:
6675   case Instruction::SDiv:
6676   case Instruction::URem:
6677   case Instruction::SRem:
6678     // If we have a predicated instruction, it may not be executed for each
6679     // vector lane. Get the scalarization cost and scale this amount by the
6680     // probability of executing the predicated block. If the instruction is not
6681     // predicated, we fall through to the next case.
6682     if (VF.isVector() && isScalarWithPredication(I)) {
6683       unsigned Cost = 0;
6684 
6685       // These instructions have a non-void type, so account for the phi nodes
6686       // that we will create. This cost is likely to be zero. The phi node
6687       // cost, if any, should be scaled by the block probability because it
6688       // models a copy at the end of each predicated block.
6689       Cost += VF.getKnownMinValue() *
6690               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6691 
6692       // The cost of the non-predicated instruction.
6693       Cost += VF.getKnownMinValue() *
6694               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6695 
6696       // The cost of insertelement and extractelement instructions needed for
6697       // scalarization.
6698       Cost += getScalarizationOverhead(I, VF);
6699 
6700       // Scale the cost by the probability of executing the predicated blocks.
6701       // This assumes the predicated block for each vector lane is equally
6702       // likely.
6703       return Cost / getReciprocalPredBlockProb();
6704     }
6705     LLVM_FALLTHROUGH;
6706   case Instruction::Add:
6707   case Instruction::FAdd:
6708   case Instruction::Sub:
6709   case Instruction::FSub:
6710   case Instruction::Mul:
6711   case Instruction::FMul:
6712   case Instruction::FDiv:
6713   case Instruction::FRem:
6714   case Instruction::Shl:
6715   case Instruction::LShr:
6716   case Instruction::AShr:
6717   case Instruction::And:
6718   case Instruction::Or:
6719   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6721     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6722       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6725     Value *Op2 = I->getOperand(1);
6726     TargetTransformInfo::OperandValueProperties Op2VP;
6727     TargetTransformInfo::OperandValueKind Op2VK =
6728         TTI.getOperandInfo(Op2, Op2VP);
6729     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6730       Op2VK = TargetTransformInfo::OK_UniformValue;
6731 
6732     SmallVector<const Value *, 4> Operands(I->operand_values());
6733     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6734     return N * TTI.getArithmeticInstrCost(
6735                    I->getOpcode(), VectorTy, CostKind,
6736                    TargetTransformInfo::OK_AnyValue,
6737                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6738   }
6739   case Instruction::FNeg: {
6740     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6741     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6742     return N * TTI.getArithmeticInstrCost(
6743                    I->getOpcode(), VectorTy, CostKind,
6744                    TargetTransformInfo::OK_AnyValue,
6745                    TargetTransformInfo::OK_AnyValue,
6746                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6747                    I->getOperand(0), I);
6748   }
6749   case Instruction::Select: {
6750     SelectInst *SI = cast<SelectInst>(I);
6751     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6752     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6753     Type *CondTy = SI->getCondition()->getType();
6754     if (!ScalarCond) {
6755       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6756       CondTy = VectorType::get(CondTy, VF);
6757     }
6758     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6759                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
6760   }
6761   case Instruction::ICmp:
6762   case Instruction::FCmp: {
6763     Type *ValTy = I->getOperand(0)->getType();
6764     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6765     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6766       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6767     VectorTy = ToVectorTy(ValTy, VF);
6768     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6769                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
6770   }
6771   case Instruction::Store:
6772   case Instruction::Load: {
6773     ElementCount Width = VF;
6774     if (Width.isVector()) {
6775       InstWidening Decision = getWideningDecision(I, Width);
6776       assert(Decision != CM_Unknown &&
6777              "CM decision should be taken at this point");
6778       if (Decision == CM_Scalarize)
6779         Width = ElementCount::getFixed(1);
6780     }
6781     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6782     return getMemoryInstructionCost(I, VF);
6783   }
6784   case Instruction::ZExt:
6785   case Instruction::SExt:
6786   case Instruction::FPToUI:
6787   case Instruction::FPToSI:
6788   case Instruction::FPExt:
6789   case Instruction::PtrToInt:
6790   case Instruction::IntToPtr:
6791   case Instruction::SIToFP:
6792   case Instruction::UIToFP:
6793   case Instruction::Trunc:
6794   case Instruction::FPTrunc:
6795   case Instruction::BitCast: {
6796     // Computes the CastContextHint from a Load/Store instruction.
6797     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6798       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6799              "Expected a load or a store!");
6800 
6801       if (VF.isScalar() || !TheLoop->contains(I))
6802         return TTI::CastContextHint::Normal;
6803 
6804       switch (getWideningDecision(I, VF)) {
6805       case LoopVectorizationCostModel::CM_GatherScatter:
6806         return TTI::CastContextHint::GatherScatter;
6807       case LoopVectorizationCostModel::CM_Interleave:
6808         return TTI::CastContextHint::Interleave;
6809       case LoopVectorizationCostModel::CM_Scalarize:
6810       case LoopVectorizationCostModel::CM_Widen:
6811         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6812                                         : TTI::CastContextHint::Normal;
6813       case LoopVectorizationCostModel::CM_Widen_Reverse:
6814         return TTI::CastContextHint::Reversed;
6815       case LoopVectorizationCostModel::CM_Unknown:
6816         llvm_unreachable("Instr did not go through cost modelling?");
6817       }
6818 
6819       llvm_unreachable("Unhandled case!");
6820     };
6821 
6822     unsigned Opcode = I->getOpcode();
6823     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6824     // For Trunc, the context is the only user, which must be a StoreInst.
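    // For example (illustrative IR): for "%t = trunc i32 %x to i16" whose only
    // use is "store i16 %t, i16* %p", the store's widening decision (widened,
    // masked, reversed, ...) supplies the cast context computed below.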
6825     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6826       if (I->hasOneUse())
6827         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6828           CCH = ComputeCCH(Store);
6829     }
6830     // For Z/Sext, the context is the operand, which must be a LoadInst.
6831     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6832              Opcode == Instruction::FPExt) {
6833       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6834         CCH = ComputeCCH(Load);
6835     }
6836 
6837     // We optimize the truncation of induction variables having constant
6838     // integer steps. The cost of these truncations is the same as the scalar
6839     // operation.
6840     if (isOptimizableIVTruncate(I, VF)) {
6841       auto *Trunc = cast<TruncInst>(I);
6842       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6843                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6844     }
6845 
6846     Type *SrcScalarTy = I->getOperand(0)->getType();
6847     Type *SrcVecTy =
6848         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6849     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6853       //
6854       // Calculate the modified src and dest types.
6855       Type *MinVecTy = VectorTy;
6856       if (Opcode == Instruction::Trunc) {
6857         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6858         VectorTy =
6859             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6860       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6861         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6862         VectorTy =
6863             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6864       }
6865     }
6866 
6867     assert(!VF.isScalable() && "VF is assumed to be non scalable");
6868     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6869     return N *
6870            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6871   }
6872   case Instruction::Call: {
6873     bool NeedToScalarize;
6874     CallInst *CI = cast<CallInst>(I);
6875     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6876     if (getVectorIntrinsicIDForCall(CI, TLI))
6877       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6878     return CallCost;
6879   }
6880   case Instruction::ExtractValue:
6881     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
6882   default:
6883     // The cost of executing VF copies of the scalar instruction. This opcode
6884     // is unknown. Assume that it is the same as 'mul'.
6885     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6886                                        Instruction::Mul, VectorTy, CostKind) +
6887            getScalarizationOverhead(I, VF);
6888   } // end of switch.
6889 }
6890 
6891 char LoopVectorize::ID = 0;
6892 
6893 static const char lv_name[] = "Loop Vectorization";
6894 
6895 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6896 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6897 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6898 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6899 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6900 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6901 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6902 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6903 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6904 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6905 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6906 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6907 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6908 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6909 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6910 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6911 
6912 namespace llvm {
6913 
6914 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6915 
6916 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6917                               bool VectorizeOnlyWhenForced) {
6918   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6919 }
6920 
6921 } // end namespace llvm
6922 
6923 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6924   // Check if the pointer operand of a load or store instruction is
6925   // consecutive.
6926   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6927     return Legal->isConsecutivePtr(Ptr);
6928   return false;
6929 }
6930 
6931 void LoopVectorizationCostModel::collectValuesToIgnore() {
6932   // Ignore ephemeral values.
6933   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6934 
6935   // Ignore type-promoting instructions we identified during reduction
6936   // detection.
6937   for (auto &Reduction : Legal->getReductionVars()) {
6938     RecurrenceDescriptor &RedDes = Reduction.second;
6939     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6940     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6941   }
6942   // Ignore type-casting instructions we identified during induction
6943   // detection.
6944   for (auto &Induction : Legal->getInductionVars()) {
6945     InductionDescriptor &IndDes = Induction.second;
6946     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6947     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6948   }
6949 }
6950 
6951 void LoopVectorizationCostModel::collectInLoopReductions() {
6952   for (auto &Reduction : Legal->getReductionVars()) {
6953     PHINode *Phi = Reduction.first;
6954     RecurrenceDescriptor &RdxDesc = Reduction.second;
6955 
6956     // We don't collect reductions that are type promoted (yet).
6957     if (RdxDesc.getRecurrenceType() != Phi->getType())
6958       continue;
6959 
6960     // If the target would prefer this reduction to happen "in-loop", then we
6961     // want to record it as such.
6962     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
6963     if (!PreferInLoopReductions &&
6964         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6965                                    TargetTransformInfo::ReductionFlags()))
6966       continue;
6967 
6968     // Check that we can correctly put the reductions into the loop, by
6969     // finding the chain of operations that leads from the phi to the loop
6970     // exit value.
6971     SmallVector<Instruction *, 4> ReductionOperations =
6972         RdxDesc.getReductionOpChain(Phi, TheLoop);
6973     bool InLoop = !ReductionOperations.empty();
6974     if (InLoop)
6975       InLoopReductionChains[Phi] = ReductionOperations;
6976     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6977                       << " reduction for phi: " << *Phi << "\n");
6978   }
6979 }
6980 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
6986 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6987                                  LoopVectorizationCostModel &CM) {
6988   unsigned WidestType;
6989   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
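  // For example (illustrative): 256-bit wide vector registers and a widest
  // scalar type of 32 bits give a VPlan VF of 256 / 32 = 8.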
6990   return WidestVectorRegBits / WidestType;
6991 }
6992 
6993 VectorizationFactor
6994 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6995   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6996   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
6999   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7000   // the vectorization pipeline.
7001   if (!OrigLoop->isInnermost()) {
7002     // If the user doesn't provide a vectorization factor, determine a
7003     // reasonable one.
7004     if (UserVF.isZero()) {
7005       VF = ElementCount::getFixed(
7006           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7007       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7008 
7009       // Make sure we have a VF > 1 for stress testing.
7010       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7011         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7012                           << "overriding computed VF.\n");
7013         VF = ElementCount::getFixed(4);
7014       }
7015     }
7016     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7017     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7018            "VF needs to be a power of two");
7019     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7020                       << "VF " << VF << " to build VPlans.\n");
7021     buildVPlans(VF, VF);
7022 
7023     // For VPlan build stress testing, we bail out after VPlan construction.
7024     if (VPlanBuildStressTest)
7025       return VectorizationFactor::Disabled();
7026 
7027     return {VF, 0 /*Cost*/};
7028   }
7029 
7030   LLVM_DEBUG(
7031       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7032                 "VPlan-native path.\n");
7033   return VectorizationFactor::Disabled();
7034 }
7035 
7036 Optional<VectorizationFactor>
7037 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7038   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
7039   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7040   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7042     return None;
7043 
7044   // Invalidate interleave groups if all blocks of loop will be predicated.
7045   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7046       !useMaskedInterleavedAccesses(*TTI)) {
7047     LLVM_DEBUG(
7048         dbgs()
7049         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7050            "which requires masked-interleaved support.\n");
7051     if (CM.InterleaveInfo.invalidateGroups())
7052       // Invalidating interleave groups also requires invalidating all decisions
7053       // based on them, which includes widening decisions and uniform and scalar
7054       // values.
7055       CM.invalidateCostModelingDecisions();
7056   }
7057 
7058   ElementCount MaxVF = MaybeMaxVF.getValue();
7059   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7060 
7061   if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
7062     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7063     assert(isPowerOf2_32(UserVF.getFixedValue()) &&
7064            "VF needs to be a power of two");
7065     // Collect the instructions (and their associated costs) that will be more
7066     // profitable to scalarize.
7067     CM.selectUserVectorizationFactor(UserVF);
7068     CM.collectInLoopReductions();
7069     buildVPlansWithVPRecipes(UserVF, UserVF);
7070     LLVM_DEBUG(printPlans(dbgs()));
7071     return {{UserVF, 0}};
7072   }
7073 
7074   for (ElementCount VF = ElementCount::getFixed(1);
7075        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7076     // Collect Uniform and Scalar instructions after vectorization with VF.
7077     CM.collectUniformsAndScalars(VF);
7078 
7079     // Collect the instructions (and their associated costs) that will be more
7080     // profitable to scalarize.
7081     if (VF.isVector())
7082       CM.collectInstsToScalarize(VF);
7083   }
7084 
7085   CM.collectInLoopReductions();
7086 
7087   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7088   LLVM_DEBUG(printPlans(dbgs()));
7089   if (MaxVF.isScalar())
7090     return VectorizationFactor::Disabled();
7091 
7092   // Select the optimal vectorization factor.
7093   return CM.selectVectorizationFactor(MaxVF);
7094 }
7095 
7096 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7097   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7098                     << '\n');
7099   BestVF = VF;
7100   BestUF = UF;
7101 
7102   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7103     return !Plan->hasVF(VF);
7104   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7106 }
7107 
7108 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7109                                            DominatorTree *DT) {
7110   // Perform the actual loop transformation.
7111 
7112   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7113   VPCallbackILV CallbackILV(ILV);
7114 
7115   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7116 
7117   VPTransformState State{*BestVF, BestUF,      LI,
7118                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7119                          &ILV,    CallbackILV};
7120   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7121   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7122   State.CanonicalIV = ILV.Induction;
7123 
7124   //===------------------------------------------------===//
7125   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
7129   //
7130   //===------------------------------------------------===//
7131 
7132   // 2. Copy and widen instructions from the old loop into the new loop.
7133   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7134   VPlans.front()->execute(&State);
7135 
7136   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7137   //    predication, updating analyses.
7138   ILV.fixVectorizedLoop();
7139 }
7140 
7141 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7142     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7143   BasicBlock *Latch = OrigLoop->getLoopLatch();
7144 
7145   // We create new control-flow for the vectorized loop, so the original
7146   // condition will be dead after vectorization if it's only used by the
7147   // branch.
7148   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7149   if (Cmp && Cmp->hasOneUse()) {
7150     DeadInstructions.insert(Cmp);
7151 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
7153     for (Value *Op : Cmp->operands()) {
7154       if (isa<TruncInst>(Op) && Op->hasOneUse())
7155           DeadInstructions.insert(cast<Instruction>(Op));
7156     }
7157   }
7158 
7159   // We create new "steps" for induction variable updates to which the original
7160   // induction variables map. An original update instruction will be dead if
7161   // all its users except the induction variable are dead.
7162   for (auto &Induction : Legal->getInductionVars()) {
7163     PHINode *Ind = Induction.first;
7164     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7165 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7168     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7169       continue;
7170 
7171     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7172           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7173         }))
7174       DeadInstructions.insert(IndUpdate);
7175 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
7178     // vectorized loop because we have proven that, under a proper runtime
7179     // test guarding the vectorized loop, the value of the phi, and the casted
7180     // value of the phi, are the same. The last instruction in this casting chain
7181     // will get its scalar/vector/widened def from the scalar/vector/widened def
7182     // of the respective phi node. Any other casts in the induction def-use chain
7183     // have no other uses outside the phi update chain, and will be ignored.
7184     InductionDescriptor &IndDes = Induction.second;
7185     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7186     DeadInstructions.insert(Casts.begin(), Casts.end());
7187   }
7188 }
7189 
7190 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7191 
7192 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7193 
7194 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7195                                         Instruction::BinaryOps BinOp) {
7196   // When unrolling and the VF is 1, we only need to add a simple scalar.
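  // For example (illustrative): with StartIdx = 2 the result is
  // Val + 2 * Step, or its fast-math floating-point counterpart below.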
7197   Type *Ty = Val->getType();
7198   assert(!Ty->isVectorTy() && "Val must be a scalar");
7199 
7200   if (Ty->isFloatingPointTy()) {
7201     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7202 
7203     // Floating point operations had to be 'fast' to enable the unrolling.
7204     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7205     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7206   }
7207   Constant *C = ConstantInt::get(Ty, StartIdx);
7208   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7209 }
7210 
7211 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7212   SmallVector<Metadata *, 4> MDs;
7213   // Reserve first location for self reference to the LoopID metadata node.
7214   MDs.push_back(nullptr);
7215   bool IsUnrollMetadata = false;
7216   MDNode *LoopID = L->getLoopID();
7217   if (LoopID) {
7218     // First find existing loop unrolling disable metadata.
7219     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7220       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7221       if (MD) {
7222         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata |=
            S && S->getString().startswith("llvm.loop.unroll.disable");
7225       }
7226       MDs.push_back(LoopID->getOperand(i));
7227     }
7228   }
7229 
7230   if (!IsUnrollMetadata) {
7231     // Add runtime unroll disable metadata.
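    // The new LoopID then looks roughly like (textual IR sketch):
    //   !0 = !{!0, <existing operands...>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}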
7232     LLVMContext &Context = L->getHeader()->getContext();
7233     SmallVector<Metadata *, 1> DisableOperands;
7234     DisableOperands.push_back(
7235         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7236     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7237     MDs.push_back(DisableNode);
7238     MDNode *NewLoopID = MDNode::get(Context, MDs);
7239     // Set operand 0 to refer to the loop id itself.
7240     NewLoopID->replaceOperandWith(0, NewLoopID);
7241     L->setLoopID(NewLoopID);
7242   }
7243 }
7244 
7245 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7246     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
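  // Walk the powers of two in (Range.Start, Range.End). The first VF whose
  // decision differs from the decision at Range.Start becomes the new
  // exclusive end of the range, so every VF left in the range shares the same
  // decision, which is the value returned.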
7247   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7248   bool PredicateAtRangeStart = Predicate(Range.Start);
7249 
7250   for (ElementCount TmpVF = Range.Start * 2;
7251        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7252     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7253       Range.End = TmpVF;
7254       break;
7255     }
7256 
7257   return PredicateAtRangeStart;
7258 }
7259 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
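/// For example, with MinVF = 1 and MaxVF = 8 this may produce plans for the
/// sub-ranges {1}, {2, 4} and {8}, depending on where the vectorization
/// decisions change.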
7265 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7266                                            ElementCount MaxVF) {
7267   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7268   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7269     VFRange SubRange = {VF, MaxVFPlusOne};
7270     VPlans.push_back(buildVPlan(SubRange));
7271     VF = SubRange.End;
7272   }
7273 }
7274 
7275 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7276                                          VPlanPtr &Plan) {
7277   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7278 
7279   // Look for cached value.
7280   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7281   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7282   if (ECEntryIt != EdgeMaskCache.end())
7283     return ECEntryIt->second;
7284 
7285   VPValue *SrcMask = createBlockInMask(Src, Plan);
7286 
7287   // The terminator has to be a branch inst!
7288   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7289   assert(BI && "Unexpected terminator found");
7290 
7291   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7292     return EdgeMaskCache[Edge] = SrcMask;
7293 
7294   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7295   assert(EdgeMask && "No Edge Mask found for condition");
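  // If the edge goes to the false successor, negate the branch condition to
  // obtain the edge mask.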
7296 
7297   if (BI->getSuccessor(0) != Dst)
7298     EdgeMask = Builder.createNot(EdgeMask);
7299 
7300   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7301     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7302 
7303   return EdgeMaskCache[Edge] = EdgeMask;
7304 }
7305 
7306 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7307   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7308 
7309   // Look for cached value.
7310   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7311   if (BCEntryIt != BlockMaskCache.end())
7312     return BCEntryIt->second;
7313 
7314   // All-one mask is modelled as no-mask following the convention for masked
7315   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7316   VPValue *BlockMask = nullptr;
7317 
7318   if (OrigLoop->getHeader() == BB) {
7319     if (!CM.blockNeedsPredication(BB))
7320       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7321 
7322     // Create the block in mask as the first non-phi instruction in the block.
7323     VPBuilder::InsertPointGuard Guard(Builder);
7324     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7325     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7326 
7327     // Introduce the early-exit compare IV <= BTC to form header block mask.
7328     // This is used instead of IV < TC because TC may wrap, unlike BTC.
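    // E.g., for a loop with trip count 10 and VF 4, the mask of the third
    // vector iteration compares lanes {8,9,10,11} against BTC 9, giving
    // <1,1,0,0>.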
7329     // Start by constructing the desired canonical IV.
7330     VPValue *IV = nullptr;
7331     if (Legal->getPrimaryInduction())
7332       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7333     else {
7334       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7335       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7336       IV = IVRecipe->getVPValue();
7337     }
7338     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7339     bool TailFolded = !CM.isScalarEpilogueAllowed();
7340 
7341     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7342       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7343       // as a second argument, we only pass the IV here and extract the
7344       // tripcount from the transform state where codegen of the VP instructions
      // happens.
7346       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7347     } else {
7348       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7349     }
7350     return BlockMaskCache[BB] = BlockMask;
7351   }
7352 
  // This is the block mask: OR together the masks of all incoming edges.
7354   for (auto *Predecessor : predecessors(BB)) {
7355     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7356     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7357       return BlockMaskCache[BB] = EdgeMask;
7358 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
7360       BlockMask = EdgeMask;
7361       continue;
7362     }
7363 
7364     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7365   }
7366 
7367   return BlockMaskCache[BB] = BlockMask;
7368 }
7369 
7370 VPWidenMemoryInstructionRecipe *
7371 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7372                                   VPlanPtr &Plan) {
7373   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7374          "Must be called with either a load or store");
7375 
7376   auto willWiden = [&](ElementCount VF) -> bool {
7377     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7378     if (VF.isScalar())
7379       return false;
7380     LoopVectorizationCostModel::InstWidening Decision =
7381         CM.getWideningDecision(I, VF);
7382     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7383            "CM decision should be taken at this point.");
7384     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7385       return true;
7386     if (CM.isScalarAfterVectorization(I, VF) ||
7387         CM.isProfitableToScalarize(I, VF))
7388       return false;
7389     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7390   };
7391 
7392   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7393     return nullptr;
7394 
7395   VPValue *Mask = nullptr;
7396   if (Legal->isMaskRequired(I))
7397     Mask = createBlockInMask(I->getParent(), Plan);
7398 
7399   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7400   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7401     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7402 
7403   StoreInst *Store = cast<StoreInst>(I);
7404   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7405   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7406 }
7407 
7408 VPWidenIntOrFpInductionRecipe *
7409 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7410   // Check if this is an integer or fp induction. If so, build the recipe that
7411   // produces its scalar and vector values.
7412   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7413   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7414       II.getKind() == InductionDescriptor::IK_FpInduction)
7415     return new VPWidenIntOrFpInductionRecipe(Phi);
7416 
7417   return nullptr;
7418 }
7419 
7420 VPWidenIntOrFpInductionRecipe *
7421 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7422                                                 VFRange &Range) const {
7423   // Optimize the special case where the source is a constant integer
7424   // induction variable. Notice that we can only optimize the 'trunc' case
7425   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7426   // (c) other casts depend on pointer size.
7427 
7428   // Determine whether \p K is a truncation based on an induction variable that
7429   // can be optimized.
7430   auto isOptimizableIVTruncate =
7431       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7432     return [=](ElementCount VF) -> bool {
7433       return CM.isOptimizableIVTruncate(K, VF);
7434     };
7435   };
7436 
7437   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7438           isOptimizableIVTruncate(I), Range))
7439     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7440                                              I);
7441   return nullptr;
7442 }
7443 
7444 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7445   // We know that all PHIs in non-header blocks are converted into selects, so
7446   // we don't have to worry about the insertion order and we can just use the
7447   // builder. At this point we generate the predication tree. There may be
7448   // duplications since this is a simple recursive scan, but future
7449   // optimizations will clean it up.
7450 
7451   SmallVector<VPValue *, 2> Operands;
7452   unsigned NumIncoming = Phi->getNumIncomingValues();
7453   for (unsigned In = 0; In < NumIncoming; In++) {
7454     VPValue *EdgeMask =
7455       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7456     assert((EdgeMask || NumIncoming == 1) &&
7457            "Multiple predecessors with one having a full mask");
7458     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7459     if (EdgeMask)
7460       Operands.push_back(EdgeMask);
7461   }
7462   return new VPBlendRecipe(Phi, Operands);
7463 }
7464 
7465 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7466                                                    VPlan &Plan) const {
7467 
7468   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7469       [this, CI](ElementCount VF) {
7470         return CM.isScalarWithPredication(CI, VF);
7471       },
7472       Range);
7473 
7474   if (IsPredicated)
7475     return nullptr;
7476 
7477   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7478   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7479              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7480              ID == Intrinsic::pseudoprobe))
7481     return nullptr;
7482 
7483   auto willWiden = [&](ElementCount VF) -> bool {
7484     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag indicates whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction: is the intrinsic call more
    // beneficial than the lib call?
7489     bool NeedToScalarize = false;
7490     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7491     bool UseVectorIntrinsic =
7492         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7493     return UseVectorIntrinsic || !NeedToScalarize;
7494   };
7495 
7496   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7497     return nullptr;
7498 
7499   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7500 }
7501 
7502 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7503   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7504          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
7507   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7508     return CM.isScalarAfterVectorization(I, VF) ||
7509            CM.isProfitableToScalarize(I, VF) ||
7510            CM.isScalarWithPredication(I, VF);
7511   };
7512   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7513                                                              Range);
7514 }
7515 
7516 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7517   auto IsVectorizableOpcode = [](unsigned Opcode) {
7518     switch (Opcode) {
7519     case Instruction::Add:
7520     case Instruction::And:
7521     case Instruction::AShr:
7522     case Instruction::BitCast:
7523     case Instruction::FAdd:
7524     case Instruction::FCmp:
7525     case Instruction::FDiv:
7526     case Instruction::FMul:
7527     case Instruction::FNeg:
7528     case Instruction::FPExt:
7529     case Instruction::FPToSI:
7530     case Instruction::FPToUI:
7531     case Instruction::FPTrunc:
7532     case Instruction::FRem:
7533     case Instruction::FSub:
7534     case Instruction::ICmp:
7535     case Instruction::IntToPtr:
7536     case Instruction::LShr:
7537     case Instruction::Mul:
7538     case Instruction::Or:
7539     case Instruction::PtrToInt:
7540     case Instruction::SDiv:
7541     case Instruction::Select:
7542     case Instruction::SExt:
7543     case Instruction::Shl:
7544     case Instruction::SIToFP:
7545     case Instruction::SRem:
7546     case Instruction::Sub:
7547     case Instruction::Trunc:
7548     case Instruction::UDiv:
7549     case Instruction::UIToFP:
7550     case Instruction::URem:
7551     case Instruction::Xor:
7552     case Instruction::ZExt:
7553       return true;
7554     }
7555     return false;
7556   };
7557 
7558   if (!IsVectorizableOpcode(I->getOpcode()))
7559     return nullptr;
7560 
7561   // Success: widen this instruction.
7562   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7563 }
7564 
7565 VPBasicBlock *VPRecipeBuilder::handleReplication(
7566     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7567     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7568     VPlanPtr &Plan) {
7569   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7570       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7571       Range);
7572 
7573   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7574       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7575       Range);
7576 
7577   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7578                                        IsUniform, IsPredicated);
7579   setRecipe(I, Recipe);
7580   Plan->addVPValue(I, Recipe);
7581 
  // Check whether I uses a predicated instruction. If so, it will use its
  // scalar
7583   // value. Avoid hoisting the insert-element which packs the scalar value into
7584   // a vector value, as that happens iff all users use the vector value.
7585   for (auto &Op : I->operands())
7586     if (auto *PredInst = dyn_cast<Instruction>(Op))
7587       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7588         PredInst2Recipe[PredInst]->setAlsoPack(false);
7589 
  // Finalize the recipe for Instr; handle the non-predicated case first.
7591   if (!IsPredicated) {
7592     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7593     VPBB->appendRecipe(Recipe);
7594     return VPBB;
7595   }
7596   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7597   assert(VPBB->getSuccessors().empty() &&
7598          "VPBB has successors when handling predicated replication.");
7599   // Record predicated instructions for above packing optimizations.
7600   PredInst2Recipe[I] = Recipe;
7601   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7602   VPBlockUtils::insertBlockAfter(Region, VPBB);
7603   auto *RegSucc = new VPBasicBlock();
7604   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7605   return RegSucc;
7606 }
7607 
7608 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7609                                                       VPRecipeBase *PredRecipe,
7610                                                       VPlanPtr &Plan) {
7611   // Instructions marked for predication are replicated and placed under an
7612   // if-then construct to prevent side-effects.
7613 
7614   // Generate recipes to compute the block mask for this region.
7615   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7616 
7617   // Build the triangular if-then region.
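  // It has the shape:
  //   pred.<opcode>.entry     (VPBranchOnMaskRecipe on BlockInMask)
  //      |         \
  //      |   pred.<opcode>.if (PredRecipe, the replicated instruction)
  //      |         /
  //   pred.<opcode>.continue  (VPPredInstPHIRecipe, if Instr produces a value)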
7618   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7619   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7620   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7621   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7622   auto *PHIRecipe = Instr->getType()->isVoidTy()
7623                         ? nullptr
7624                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
7625   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7626   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7627   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7628 
7629   // Note: first set Entry as region entry and then connect successors starting
7630   // from it in order, to propagate the "parent" of each VPBasicBlock.
7631   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7632   VPBlockUtils::connectBlocks(Pred, Exit);
7633 
7634   return Region;
7635 }
7636 
7637 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7638                                                       VFRange &Range,
7639                                                       VPlanPtr &Plan) {
7640   // First, check for specific widening recipes that deal with calls, memory
7641   // operations, inductions and Phi nodes.
7642   if (auto *CI = dyn_cast<CallInst>(Instr))
7643     return tryToWidenCall(CI, Range, *Plan);
7644 
7645   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7646     return tryToWidenMemory(Instr, Range, Plan);
7647 
7648   VPRecipeBase *Recipe;
7649   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7650     if (Phi->getParent() != OrigLoop->getHeader())
7651       return tryToBlend(Phi, Plan);
7652     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7653       return Recipe;
7654     return new VPWidenPHIRecipe(Phi);
7655   }
7656 
7657   if (isa<TruncInst>(Instr) &&
7658       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7659     return Recipe;
7660 
7661   if (!shouldWiden(Instr, Range))
7662     return nullptr;
7663 
7664   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7665     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7666                                 OrigLoop);
7667 
7668   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7669     bool InvariantCond =
7670         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7671     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7672                                    InvariantCond);
7673   }
7674 
7675   return tryToWiden(Instr, *Plan);
7676 }
7677 
7678 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
7679                                                         ElementCount MaxVF) {
7680   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7681 
7682   // Collect instructions from the original loop that will become trivially dead
7683   // in the vectorized loop. We don't need to vectorize these instructions. For
7684   // example, original induction update instructions can become dead because we
7685   // separately emit induction "steps" when generating code for the new loop.
7686   // Similarly, we create a new latch condition when setting up the structure
7687   // of the new loop, so the old one can become dead.
7688   SmallPtrSet<Instruction *, 4> DeadInstructions;
7689   collectTriviallyDeadInstructions(DeadInstructions);
7690 
7691   // Add assume instructions we need to drop to DeadInstructions, to prevent
7692   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7694   // control flow is preserved, we should keep them.
7695   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7696   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7697 
7698   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7699   // Dead instructions do not need sinking. Remove them from SinkAfter.
7700   for (Instruction *I : DeadInstructions)
7701     SinkAfter.erase(I);
7702 
7703   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7704   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7705     VFRange SubRange = {VF, MaxVFPlusOne};
7706     VPlans.push_back(
7707         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
7708     VF = SubRange.End;
7709   }
7710 }
7711 
7712 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7713     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
7714     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7715 
7716   // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7719   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7720 
7721   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7722 
7723   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7724 
7725   // ---------------------------------------------------------------------------
7726   // Pre-construction: record ingredients whose recipes we'll need to further
7727   // process after constructing the initial VPlan.
7728   // ---------------------------------------------------------------------------
7729 
7730   // Mark instructions we'll need to sink later and their targets as
7731   // ingredients whose recipe we'll need to record.
7732   for (auto &Entry : SinkAfter) {
7733     RecipeBuilder.recordRecipeOf(Entry.first);
7734     RecipeBuilder.recordRecipeOf(Entry.second);
7735   }
7736   for (auto &Reduction : CM.getInLoopReductionChains()) {
7737     PHINode *Phi = Reduction.first;
7738     RecurrenceDescriptor::RecurrenceKind Kind =
7739         Legal->getReductionVars()[Phi].getRecurrenceKind();
7740     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7741 
7742     RecipeBuilder.recordRecipeOf(Phi);
7743     for (auto &R : ReductionOperations) {
7744       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
7746       // need to record the ICmp recipe, so it can be removed later.
7747       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7748           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7749         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7750       }
7751     }
7752   }
7753 
7754   // For each interleave group which is relevant for this (possibly trimmed)
7755   // Range, add it to the set of groups to be later applied to the VPlan and add
7756   // placeholders for its members' Recipes which we'll be replacing with a
7757   // single VPInterleaveRecipe.
7758   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7759     auto applyIG = [IG, this](ElementCount VF) -> bool {
7760       return (VF.isVector() && // Query is illegal for VF == 1
7761               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7762                   LoopVectorizationCostModel::CM_Interleave);
7763     };
7764     if (!getDecisionAndClampRange(applyIG, Range))
7765       continue;
7766     InterleaveGroups.insert(IG);
7767     for (unsigned i = 0; i < IG->getFactor(); i++)
7768       if (Instruction *Member = IG->getMember(i))
7769         RecipeBuilder.recordRecipeOf(Member);
7770   };
7771 
7772   // ---------------------------------------------------------------------------
7773   // Build initial VPlan: Scan the body of the loop in a topological order to
7774   // visit each basic block after having visited its predecessor basic blocks.
7775   // ---------------------------------------------------------------------------
7776 
7777   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7778   auto Plan = std::make_unique<VPlan>();
7779   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7780   Plan->setEntry(VPBB);
7781 
7782   // Scan the body of the loop in a topological order to visit each basic block
7783   // after having visited its predecessor basic blocks.
7784   LoopBlocksDFS DFS(OrigLoop);
7785   DFS.perform(LI);
7786 
7787   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7788     // Relevant instructions from basic block BB will be grouped into VPRecipe
7789     // ingredients and fill a new VPBasicBlock.
7790     unsigned VPBBsForBB = 0;
7791     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7792     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7793     VPBB = FirstVPBBForBB;
7794     Builder.setInsertPoint(VPBB);
7795 
7796     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7798     for (Instruction &I : BB->instructionsWithoutDebug()) {
7799       Instruction *Instr = &I;
7800 
7801       // First filter out irrelevant instructions, to ensure no recipes are
7802       // built for them.
7803       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7804         continue;
7805 
7806       if (auto Recipe =
7807               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7808         // Check if the recipe can be converted to a VPValue. We need the extra
7809         // down-casting step until VPRecipeBase inherits from VPValue.
7810         VPValue *MaybeVPValue = Recipe->toVPValue();
7811         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
7812           Plan->addVPValue(Instr, MaybeVPValue);
7813 
7814         RecipeBuilder.setRecipe(Instr, Recipe);
7815         VPBB->appendRecipe(Recipe);
7816         continue;
7817       }
7818 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7821       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7822           Instr, Range, VPBB, PredInst2Recipe, Plan);
7823       if (NextVPBB != VPBB) {
7824         VPBB = NextVPBB;
7825         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7826                                     : "");
7827       }
7828     }
7829   }
7830 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7834   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7835   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7836   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7837   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7838   delete PreEntry;
7839 
7840   // ---------------------------------------------------------------------------
7841   // Transform initial VPlan: Apply previously taken decisions, in order, to
7842   // bring the VPlan to its final state.
7843   // ---------------------------------------------------------------------------
7844 
7845   // Apply Sink-After legal constraints.
7846   for (auto &Entry : SinkAfter) {
7847     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7848     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7849     Sink->moveAfter(Target);
7850   }
7851 
7852   // Interleave memory: for each Interleave Group we marked earlier as relevant
7853   // for this VPlan, replace the Recipes widening its memory instructions with a
7854   // single VPInterleaveRecipe at its insertion point.
7855   for (auto IG : InterleaveGroups) {
7856     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7857         RecipeBuilder.getRecipe(IG->getInsertPos()));
7858     SmallVector<VPValue *, 4> StoredValues;
7859     for (unsigned i = 0; i < IG->getFactor(); ++i)
7860       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
7861         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
7862 
7863     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
7864                             Recipe->getMask()))
7865         ->insertBefore(Recipe);
7866 
7867     for (unsigned i = 0; i < IG->getFactor(); ++i)
7868       if (Instruction *Member = IG->getMember(i)) {
7869         if (!Member->getType()->isVoidTy()) {
7870           VPValue *OriginalV = Plan->getVPValue(Member);
7871           Plan->removeVPValueFor(Member);
7872           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
7873         }
7874         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7875       }
7876   }
7877 
7878   // Adjust the recipes for any inloop reductions.
7879   if (Range.Start.isVector())
7880     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7881 
  // Finally, if the tail is folded by masking, introduce selects between the
  // phi
7883   // and the live-out instruction of each reduction, at the end of the latch.
7884   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7885     Builder.setInsertPoint(VPBB);
7886     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7887     for (auto &Reduction : Legal->getReductionVars()) {
7888       if (CM.isInLoopReduction(Reduction.first))
7889         continue;
7890       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
7891       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
7892       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7893     }
7894   }
7895 
7896   std::string PlanName;
7897   raw_string_ostream RSO(PlanName);
7898   ElementCount VF = Range.Start;
7899   Plan->addVF(VF);
7900   RSO << "Initial VPlan for VF={" << VF;
7901   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
7902     Plan->addVF(VF);
7903     RSO << "," << VF;
7904   }
7905   RSO << "},UF>=1";
7906   RSO.flush();
7907   Plan->setName(PlanName);
7908 
7909   return Plan;
7910 }
7911 
7912 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
7917   assert(!OrigLoop->isInnermost());
7918   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7919 
7920   // Create new empty VPlan
7921   auto Plan = std::make_unique<VPlan>();
7922 
7923   // Build hierarchical CFG
7924   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7925   HCFGBuilder.buildHierarchicalCFG();
7926 
7927   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
7928        VF *= 2)
7929     Plan->addVF(VF);
7930 
7931   if (EnableVPlanPredication) {
7932     VPlanPredicator VPP(*Plan);
7933     VPP.predicate();
7934 
7935     // Avoid running transformation to recipes until masked code generation in
7936     // VPlan-native path is in place.
7937     return Plan;
7938   }
7939 
7940   SmallPtrSet<Instruction *, 1> DeadInstructions;
7941   VPlanTransforms::VPInstructionsToVPRecipes(
7942       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7943   return Plan;
7944 }
7945 
// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
7950 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7951     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7952   for (auto &Reduction : CM.getInLoopReductionChains()) {
7953     PHINode *Phi = Reduction.first;
7954     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7955     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7956 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
7960     // For minmax the chain will be the select instructions.
7961     Instruction *Chain = Phi;
7962     for (Instruction *R : ReductionOperations) {
7963       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7964       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7965 
7966       VPValue *ChainOp = Plan->getVPValue(Chain);
7967       unsigned FirstOpId;
7968       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7969           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7970         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
7971                "Expected to replace a VPWidenSelectSC");
7972         FirstOpId = 1;
7973       } else {
7974         assert(isa<VPWidenRecipe>(WidenRecipe) &&
7975                "Expected to replace a VPWidenSC");
7976         FirstOpId = 0;
7977       }
7978       unsigned VecOpId =
7979           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7980       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7981 
7982       auto *CondOp = CM.foldTailByMasking()
7983                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
7984                          : nullptr;
7985       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7986           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
7987       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
7988       Plan->removeVPValueFor(R);
7989       Plan->addVPValue(R, RedRecipe);
7990       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7991       WidenRecipe->eraseFromParent();
7992 
7993       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7994           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7995         VPRecipeBase *CompareRecipe =
7996             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7997         assert(isa<VPWidenRecipe>(CompareRecipe) &&
7998                "Expected to replace a VPWidenSC");
7999         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8000                "Expected no remaining users");
8001         CompareRecipe->eraseFromParent();
8002       }
8003       Chain = R;
8004     }
8005   }
8006 }
8007 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
8012 
8013 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8014     Value *V, const VPIteration &Instance) {
8015   return ILV.getOrCreateScalarValue(V, Instance);
8016 }
8017 
8018 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8019                                VPSlotTracker &SlotTracker) const {
8020   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8021   IG->getInsertPos()->printAsOperand(O, false);
8022   O << ", ";
8023   getAddr()->printAsOperand(O, SlotTracker);
8024   VPValue *Mask = getMask();
8025   if (Mask) {
8026     O << ", ";
8027     Mask->printAsOperand(O, SlotTracker);
8028   }
8029   for (unsigned i = 0; i < IG->getFactor(); ++i)
8030     if (Instruction *I = IG->getMember(i))
8031       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8032 }
8033 
8034 void VPWidenCallRecipe::execute(VPTransformState &State) {
8035   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8036                                   *this, State);
8037 }
8038 
8039 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8040   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8041                                     this, *this, InvariantCond, State);
8042 }
8043 
8044 void VPWidenRecipe::execute(VPTransformState &State) {
8045   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8046 }
8047 
8048 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8049   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8050                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8051                       IsIndexLoopInvariant, State);
8052 }
8053 
8054 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8055   assert(!State.Instance && "Int or FP induction being replicated.");
8056   State.ILV->widenIntOrFpInduction(IV, Trunc);
8057 }
8058 
8059 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8060   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8061 }
8062 
8063 void VPBlendRecipe::execute(VPTransformState &State) {
8064   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8065   // We know that all PHIs in non-header blocks are converted into
8066   // selects, so we don't have to worry about the insertion order and we
8067   // can just use the builder.
8068   // At this point we generate the predication tree. There may be
8069   // duplications since this is a simple recursive scan, but future
8070   // optimizations will clean it up.
8071 
8072   unsigned NumIncoming = getNumIncomingValues();
8073 
8074   // Generate a sequence of selects of the form:
8075   // SELECT(Mask3, In3,
8076   //        SELECT(Mask2, In2,
8077   //               SELECT(Mask1, In1,
8078   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // (and are essentially undef) take their value from In0.
8081   InnerLoopVectorizer::VectorParts Entry(State.UF);
8082   for (unsigned In = 0; In < NumIncoming; ++In) {
8083     for (unsigned Part = 0; Part < State.UF; ++Part) {
8084       // We might have single edge PHIs (blocks) - use an identity
8085       // 'select' for the first PHI operand.
8086       Value *In0 = State.get(getIncomingValue(In), Part);
8087       if (In == 0)
8088         Entry[Part] = In0; // Initialize with the first incoming value.
8089       else {
8090         // Select between the current value and the previous incoming edge
8091         // based on the incoming mask.
8092         Value *Cond = State.get(getMask(In), Part);
8093         Entry[Part] =
8094             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8095       }
8096     }
8097   }
8098   for (unsigned Part = 0; Part < State.UF; ++Part)
8099     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8100 }
8101 
8102 void VPInterleaveRecipe::execute(VPTransformState &State) {
8103   assert(!State.Instance && "Interleave group being replicated.");
8104   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8105                                       getMask());
8106 }
8107 
8108 void VPReductionRecipe::execute(VPTransformState &State) {
8109   assert(!State.Instance && "Reduction being replicated.");
8110   for (unsigned Part = 0; Part < State.UF; ++Part) {
8111     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8112     Value *NewVecOp = State.get(getVecOp(), Part);
8113     if (VPValue *Cond = getCondOp()) {
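      // Blend masked-off lanes with the reduction's identity value (e.g. 0
      // for an add reduction) so they do not affect the reduced value.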
8114       Value *NewCond = State.get(Cond, Part);
8115       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8116       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8117           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8118       Constant *IdenVec =
8119           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8120       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8121       NewVecOp = Select;
8122     }
8123     Value *NewRed =
8124         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8125     Value *PrevInChain = State.get(getChainOp(), Part);
8126     Value *NextInChain;
8127     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8128         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8129       NextInChain =
8130           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8131                          NewRed, PrevInChain);
8132     } else {
8133       NextInChain = State.Builder.CreateBinOp(
8134           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8135           PrevInChain);
8136     }
8137     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8138   }
8139 }
8140 
8141 void VPReplicateRecipe::execute(VPTransformState &State) {
8142   if (State.Instance) { // Generate a single instance.
8143     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8144                                     *State.Instance, IsPredicated, State);
8145     // Insert scalar instance packing it into a vector.
8146     if (AlsoPack && State.VF.isVector()) {
8147       // If we're constructing lane 0, initialize to start from undef.
8148       if (State.Instance->Lane == 0) {
8149         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8150         Value *Undef = UndefValue::get(
8151             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8152         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8153                                       State.Instance->Part, Undef);
8154       }
8155       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8156                                            *State.Instance);
8157     }
8158     return;
8159   }
8160 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8164   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8165   for (unsigned Part = 0; Part < State.UF; ++Part)
8166     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8167       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8168                                       IsPredicated, State);
8169 }
8170 
8171 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8172   assert(State.Instance && "Branch on Mask works only on single instance.");
8173 
8174   unsigned Part = State.Instance->Part;
8175   unsigned Lane = State.Instance->Lane;
8176 
8177   Value *ConditionBit = nullptr;
8178   VPValue *BlockInMask = getMask();
8179   if (BlockInMask) {
8180     ConditionBit = State.get(BlockInMask, Part);
8181     if (ConditionBit->getType()->isVectorTy())
8182       ConditionBit = State.Builder.CreateExtractElement(
8183           ConditionBit, State.Builder.getInt32(Lane));
8184   } else // Block in mask is all-one.
8185     ConditionBit = State.Builder.getTrue();
8186 
8187   // Replace the temporary unreachable terminator with a new conditional branch,
8188   // whose two destinations will be set later when they are created.
8189   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8190   assert(isa<UnreachableInst>(CurrentTerminator) &&
8191          "Expected to replace unreachable terminator with conditional branch.");
8192   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8193   CondBr->setSuccessor(0, nullptr);
8194   ReplaceInstWithInst(CurrentTerminator, CondBr);
8195 }
8196 
8197 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8198   assert(State.Instance && "Predicated instruction PHI works per instance.");
8199   Instruction *ScalarPredInst =
8200       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8201   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8202   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8203   assert(PredicatingBB && "Predicated block has no single predecessor.");
8204 
8205   // By current pack/unpack logic we need to generate only a single phi node: if
8206   // a vector value for the predicated instruction exists at this point it means
8207   // the instruction has vector users only, and a phi for the vector value is
8208   // needed. In this case the recipe of the predicated instruction is marked to
8209   // also do that packing, thereby "hoisting" the insert-element sequence.
8210   // Otherwise, a phi node for the scalar value is needed.
8211   unsigned Part = State.Instance->Part;
8212   Instruction *PredInst =
8213       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8214   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8215     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8216     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8217     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8218     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8219     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8220     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8221   } else {
8222     Type *PredInstType = PredInst->getType();
8223     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8224     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8225     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8226     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8227   }
8228 }
8229 
8230 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8231   Instruction *Instr = getUnderlyingInstr();
8232   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8233   State.ILV->vectorizeMemoryInstruction(Instr, State,
8234                                         StoredValue ? nullptr : this, getAddr(),
8235                                         StoredValue, getMask());
8236 }
8237 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyzes whether the loop is suitable
// for predication.
8242 static ScalarEpilogueLowering getScalarEpilogueLowering(
8243     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8244     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8245     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8246     LoopVectorizationLegality &LVL) {
8247   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8248   // don't look at hints or options, and don't request a scalar epilogue.
8249   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8250   // LoopAccessInfo (due to code dependency and not being able to reliably get
8251   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8252   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8253   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8254   // back to the old way and vectorize with versioning when forced. See D81345.)
8255   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8256                                                       PGSOQueryType::IRPass) &&
8257                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8258     return CM_ScalarEpilogueNotAllowedOptSize;
8259 
8260   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8261                               !PreferPredicateOverEpilogue;
8262 
8263   // 2) Next, if disabling predication is requested on the command line, honour
8264   // this and request a scalar epilogue.
8265   if (PredicateOptDisabled)
8266     return CM_ScalarEpilogueAllowed;
8267 
8268   // 3) and 4) look if enabling predication is requested on the command line,
8269   // with a loop hint, or if the TTI hook indicates this is profitable, request
8270   // predication.
8271   if (PreferPredicateOverEpilogue ||
8272       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8273       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8274                                         LVL.getLAI()) &&
8275        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8276     return CM_ScalarEpilogueNotNeededUsePredicate;
8277 
8278   return CM_ScalarEpilogueAllowed;
8279 }
8280 
8281 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8282                            unsigned Part) {
8283   set(Def, V, Part);
8284   ILV->setVectorValue(IRDef, Part, V);
8285 }
8286 
8287 // Process the loop in the VPlan-native vectorization path. This path builds
8288 // VPlan upfront in the vectorization pipeline, which allows to apply
8289 // VPlan-to-VPlan transformations from the very beginning without modifying the
8290 // input LLVM IR.
8291 static bool processLoopInVPlanNativePath(
8292     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8293     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8294     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8295     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8296     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8297 
8298   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8299     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8300     return false;
8301   }
8302   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8303   Function *F = L->getHeader()->getParent();
8304   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8305 
8306   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8307       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8308 
8309   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8310                                 &Hints, IAI);
8311   // Use the planner for outer loop vectorization.
8312   // TODO: CM is not used at this point inside the planner. Turn CM into an
8313   // optional argument if we don't need it in the future.
8314   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8315 
8316   // Get user vectorization factor.
8317   const unsigned UserVF = Hints.getWidth();
8318 
8319   // Plan how to best vectorize, return the best VF and its cost.
8320   const VectorizationFactor VF =
8321       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8322 
8323   // If we are stress testing VPlan builds, do not attempt to generate vector
8324   // code. Masked vector code generation support will follow soon.
8325   // Also, do not attempt to vectorize if no vector code will be produced.
8326   if (VPlanBuildStressTest || EnableVPlanPredication ||
8327       VectorizationFactor::Disabled() == VF)
8328     return false;
8329 
8330   LVP.setBestPlan(VF.Width, 1);
8331 
8332   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8333                          &CM, BFI, PSI);
8334   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8335                     << L->getHeader()->getParent()->getName() << "\"\n");
8336   LVP.executePlan(LB, DT);
8337 
8338   // Mark the loop as already vectorized to avoid vectorizing again.
8339   Hints.setAlreadyVectorized();
8340 
8341   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8342   return true;
8343 }
8344 
8345 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8346     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8347                                !EnableLoopInterleaving),
8348       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8349                               !EnableLoopVectorization) {}
8350 
8351 bool LoopVectorizePass::processLoop(Loop *L) {
8352   assert((EnableVPlanNativePath || L->isInnermost()) &&
8353          "VPlan-native path is not enabled. Only process inner loops.");
8354 
8355 #ifndef NDEBUG
8356   const std::string DebugLocStr = getDebugLocString(L);
8357 #endif /* NDEBUG */
8358 
8359   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8360                     << L->getHeader()->getParent()->getName() << "\" from "
8361                     << DebugLocStr << "\n");
8362 
8363   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8364 
8365   LLVM_DEBUG(
8366       dbgs() << "LV: Loop hints:"
8367              << " force="
8368              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8369                      ? "disabled"
8370                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8371                             ? "enabled"
8372                             : "?"))
8373              << " width=" << Hints.getWidth()
8374              << " unroll=" << Hints.getInterleave() << "\n");
8375 
8376   // Function containing loop
8377   Function *F = L->getHeader()->getParent();
8378 
8379   // Looking at the diagnostic output is the only way to determine if a loop
8380   // was vectorized (other than looking at the IR or machine code), so it
8381   // is important to generate an optimization remark for each loop. Most of
8382   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8383   // generated as OptimizationRemark and OptimizationRemarkMissed are
8384   // less verbose reporting vectorized loops and unvectorized loops that may
8385   // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }
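  // With CM_ScalarEpilogueNotAllowedLowTripLoop the cost model will generally
  // try to fold the loop's tail into the vector body by masking instead of
  // emitting a scalar epilogue, and may give up on vectorizing if that is not
  // possible.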

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem right -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();
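  // A value of 0 means the user did not specify the corresponding hint, e.g.
  // via "#pragma clang loop vectorize_width(4) interleave_count(2)"; in that
  // case the planner and the cost model choose the values themselves.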

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }
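  // No VectorizationFactor at all means the planner decided up front that
  // neither vectorization nor interleaving should be attempted; the
  // diagnostics below report that decision.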

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // We decided not to vectorize the loop (widening is not profitable), so
    // interleave it instead.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // We decided that vectorizing the loop is worthwhile, so do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar remainder loop
    // when there are no runtime checks for strides and memory. A scalar loop
    // that is rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

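  // If the original loop carried followup metadata for the epilogue, hand it
  // to the remaining scalar loop; otherwise record that this loop has been
  // vectorized and, where applicable, that its scalar remainder should not be
  // runtime unrolled.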
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
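  // Note: getRegisterClassForType(true) queries the register class used for
  // vector values, and getMaxInterleaveFactor(1) is the interleave factor for
  // a scalar (VF = 1) loop.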
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether any loop was modified and whether the CFG was changed.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
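  // LoopAccessAnalysis is a loop analysis managed by the inner loop analysis
  // manager, so expose it through a callback that the per-loop processing in
  // runImpl can invoke on demand for each candidate loop.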
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
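  // ProfileSummaryAnalysis is a module analysis, so only a cached result can
  // be used here; PSI may be null if the analysis has not been run on the
  // module.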
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}