1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
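// For example, a loop such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
//
// is conceptually rewritten (an illustrative sketch only, for VF = 4) as
//
//   for (i = 0; i + 3 < n; i += 4)
//     A[i..i+3] = B[i..i+3] + C[i..i+3];   // one wide load/add/store each
//
// with the remaining iterations executed by a scalar epilogue loop, unless the
// tail is folded into the vector loop via predication.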
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
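// As a rough illustration (the exact pass-manager syntax may vary between
// releases), the VPlan-native path can be exercised with:
//
//   opt -loop-vectorize -enable-vplan-native-path -S input.ll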
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
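// A hedged illustration of how these attributes appear in IR (see
// llvm/docs/TransformMetadata.rst for the authoritative format): a followup
// attribute lists the properties to attach to the loop produced by the
// transformation, e.g. requesting that the vectorized loop be unrolled
// afterwards:
//
//   br i1 %cond, label %exit, label %header, !llvm.loop !0
//   ...
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !2 = !{!"llvm.loop.vectorize.followup_vectorized", !3}
//   !3 = !{!"llvm.loop.unroll.count", i32 2}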
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
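// For example (an illustrative invocation; the threshold value is arbitrary):
//   opt -loop-vectorize -vectorizer-min-trip-count=4 -S input.ll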
180 
181 // Indicates that an epilogue is undesired; predication is preferred instead.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
192              "which will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = FixedVectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
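// For example, with a typical data layout, i1 is irregular at VF = 8: a scalar
// i1 has an alloc size of 1 byte, so 8 scalars occupy 8 bytes, while the store
// size of <8 x i1> is a single byte, so the scalar layout is not
// bitcast-compatible with the vector layout.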
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found, for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
399                       ProfileSummaryInfo *PSI)
400       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
401         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
402         Builder(PSE.getSE()->getContext()),
403         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
404         BFI(BFI), PSI(PSI) {
405     // Query this against the original loop and save it here because the profile
406     // of the original loop header may change as the transformation happens.
407     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
408         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
409   }
410 
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop that will contain vectorized instructions later
414   /// on, while the old loop will be used as the scalar remainder. Control flow
415   /// is generated around the vectorized (and scalar epilogue) loops consisting
416   /// of various checks and bypasses. Return the pre-header block of the new
417   /// loop.
418   BasicBlock *createVectorizedLoopSkeleton();
419 
420   /// Widen a single instruction within the innermost loop.
421   void widenInstruction(Instruction &I, VPUser &Operands,
422                         VPTransformState &State);
423 
424   /// Widen a single call instruction within the innermost loop.
425   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
426                             VPTransformState &State);
427 
428   /// Widen a single select instruction within the innermost loop.
429   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
430                               bool InvariantCond, VPTransformState &State);
431 
432   /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
433   void fixVectorizedLoop();
434 
435   // Return true if any runtime check is added.
436   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
437 
438   /// A type for vectorized values in the new loop. Each value from the
439   /// original loop, when vectorized, is represented by UF vector values in the
440   /// new unrolled loop, where UF is the unroll factor.
441   using VectorParts = SmallVector<Value *, 2>;
442 
443   /// Vectorize a single GetElementPtrInst based on information gathered and
444   /// decisions taken during planning.
445   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
446                 unsigned VF, bool IsPtrLoopInvariant,
447                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
448 
449   /// Vectorize a single PHINode in a block. This method handles the induction
450   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
451   /// arbitrary length vectors.
452   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
453 
454   /// A helper function to scalarize a single Instruction in the innermost loop.
455   /// Generates a sequence of scalar instances for each lane between \p MinLane
456   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
457   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
458   /// Instr's operands.
459   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
460                             const VPIteration &Instance, bool IfPredicateInstr,
461                             VPTransformState &State);
462 
463   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
464   /// is provided, the integer induction variable will first be truncated to
465   /// the corresponding type.
466   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
467 
468   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
469   /// vector or scalar value on-demand if one is not yet available. When
470   /// vectorizing a loop, we visit the definition of an instruction before its
471   /// uses. When visiting the definition, we either vectorize or scalarize the
472   /// instruction, creating an entry for it in the corresponding map. (In some
473   /// cases, such as induction variables, we will create both vector and scalar
474   /// entries.) Then, as we encounter uses of the definition, we derive values
475   /// for each scalar or vector use unless such a value is already available.
476   /// For example, if we scalarize a definition and one of its uses is vector,
477   /// we build the required vector on-demand with an insertelement sequence
478   /// when visiting the use. Otherwise, if the use is scalar, we can use the
479   /// existing scalar definition.
480   ///
481   /// Return a value in the new loop corresponding to \p V from the original
482   /// loop at unroll index \p Part. If the value has already been vectorized,
483   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
484   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
485   /// a new vector value on-demand by inserting the scalar values into a vector
486   /// with an insertelement sequence. If the value has been neither vectorized
487   /// nor scalarized, it must be loop invariant, so we simply broadcast the
488   /// value into a vector.
489   Value *getOrCreateVectorValue(Value *V, unsigned Part);
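  // For example, if a definition was scalarized into %s0..%s3 and a use needs
  // a vector, the on-demand packing for one unroll part is conceptually
  // (VF = 4, illustrative IR):
  //
  //   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  //   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
  //   %v2 = insertelement <4 x i32> %v1,   i32 %s2, i32 2
  //   %v3 = insertelement <4 x i32> %v2,   i32 %s3, i32 3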
490 
491   /// Return a value in the new loop corresponding to \p V from the original
492   /// loop at unroll and vector indices \p Instance. If the value has been
493   /// vectorized but not scalarized, the necessary extractelement instruction
494   /// will be generated.
495   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
496 
497   /// Construct the vector value of a scalarized value \p V one lane at a time.
498   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
499 
500   /// Try to vectorize interleaved access group \p Group with the base address
501   /// given in \p Addr, optionally masking the vector operations if \p
502   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
503   /// values in the vectorized loop.
504   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
505                                 VPTransformState &State, VPValue *Addr,
506                                 VPValue *BlockInMask = nullptr);
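  // For example, an interleave group with factor 2, such as
  //
  //   for (i = 0; i < N; ++i) {
  //     x = A[2*i];      // member 0
  //     y = A[2*i + 1];  // member 1
  //   }
  //
  // is vectorized with a single wide load covering both members followed by
  // shufflevector instructions that de-interleave the even and odd elements
  // (a sketch; gaps or predication additionally require masking).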
507 
508   /// Vectorize Load and Store instructions with the base address given in \p
509   /// Addr, optionally masking the vector operations if \p BlockInMask is
510   /// non-null. Use \p State to translate given VPValues to IR values in the
511   /// vectorized loop.
512   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
513                                   VPValue *Addr, VPValue *StoredValue,
514                                   VPValue *BlockInMask);
515 
516   /// Set the debug location in the builder using the debug location in
517   /// the instruction.
518   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
519 
520   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
521   void fixNonInductionPHIs();
522 
523 protected:
524   friend class LoopVectorizationPlanner;
525 
526   /// A small list of PHINodes.
527   using PhiVector = SmallVector<PHINode *, 4>;
528 
529   /// A type for scalarized values in the new loop. Each value from the
530   /// original loop, when scalarized, is represented by UF x VF scalar values
531   /// in the new unrolled loop, where UF is the unroll factor and VF is the
532   /// vectorization factor.
533   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
534 
535   /// Set up the values of the IVs correctly when exiting the vector loop.
536   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
537                     Value *CountRoundDown, Value *EndValue,
538                     BasicBlock *MiddleBlock);
539 
540   /// Create a new induction variable inside L.
541   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
542                                    Value *Step, Instruction *DL);
543 
544   /// Handle all cross-iteration phis in the header.
545   void fixCrossIterationPHIs();
546 
547   /// Fix a first-order recurrence. This is the second phase of vectorizing
548   /// this phi node.
549   void fixFirstOrderRecurrence(PHINode *Phi);
550 
551   /// Fix a reduction cross-iteration phi. This is the second phase of
552   /// vectorizing this phi node.
553   void fixReduction(PHINode *Phi);
554 
555   /// Clear NSW/NUW flags from reduction instructions if necessary.
556   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
557 
558   /// The loop exit block may have single-value PHI nodes with some
559   /// incoming value. While vectorizing we only handled real values
560   /// that were defined inside the loop, and we should have one value for
561   /// each predecessor of its parent basic block. See PR14725.
562   void fixLCSSAPHIs();
563 
564   /// Iteratively sink the scalarized operands of a predicated instruction into
565   /// the block that was created for it.
566   void sinkScalarOperands(Instruction *PredInst);
567 
568   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
569   /// represented as.
570   void truncateToMinimalBitwidths();
571 
572   /// Create a broadcast instruction. This method generates a broadcast
573   /// instruction (shuffle) for loop invariant values and for the induction
574   /// value. If this is the induction variable then we extend it to N, N+1, ...;
575   /// this is needed because each iteration of the loop corresponds to a SIMD
576   /// element.
577   virtual Value *getBroadcastInstrs(Value *V);
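  // A broadcast is conceptually (VF = 4, illustrative IR):
  //
  //   %t = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %b = shufflevector <4 x i32> %t, <4 x i32> undef, <4 x i32> zeroinitializer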
578 
579   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
580   /// to each vector element of Val. The sequence starts at StartIdx.
581   /// \p Opcode is relevant for FP induction variable.
582   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
583                                Instruction::BinaryOps Opcode =
584                                Instruction::BinaryOpsEnd);
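  // For example, with VF = 4, StartIdx = 0 and Step = 1, the result is
  // conceptually Val + <0, 1, 2, 3>, i.e. the consecutive induction values for
  // one unrolled part (illustrative only).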
585 
586   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
587   /// variable on which to base the steps, \p Step is the size of the step, and
588   /// \p EntryVal is the value from the original loop that maps to the steps.
589   /// Note that \p EntryVal doesn't have to be an induction variable - it
590   /// can also be a truncate instruction.
591   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
592                         const InductionDescriptor &ID);
593 
594   /// Create a vector induction phi node based on an existing scalar one. \p
595   /// EntryVal is the value from the original loop that maps to the vector phi
596   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
597   /// truncate instruction, instead of widening the original IV, we widen a
598   /// version of the IV truncated to \p EntryVal's type.
599   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
600                                        Value *Step, Instruction *EntryVal);
601 
602   /// Returns true if an instruction \p I should be scalarized instead of
603   /// vectorized for the chosen vectorization factor.
604   bool shouldScalarizeInstruction(Instruction *I) const;
605 
606   /// Returns true if we should generate a scalar version of \p IV.
607   bool needsScalarInduction(Instruction *IV) const;
608 
609   /// If there is a cast involved in the induction variable \p ID, which should
610   /// be ignored in the vectorized loop body, this function records the
611   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
612   /// cast. We had already proved that the casted Phi is equal to the uncasted
613   /// Phi in the vectorized loop (under a runtime guard), and therefore
614   /// there is no need to vectorize the cast - the same value can be used in the
615   /// vector loop for both the Phi and the cast.
616   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
617   /// otherwise, \p VectorLoopValue is a widened/vectorized value.
618   ///
619   /// \p EntryVal is the value from the original loop that maps to the vector
620   /// phi node and is used to distinguish what is the IV currently being
621   /// processed - original one (if \p EntryVal is a phi corresponding to the
622   /// original IV) or the "newly-created" one based on the proof mentioned above
623   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
624   /// latter case \p EntryVal is a TruncInst and we must not record anything for
625   /// that IV, but it's error-prone to expect callers of this routine to care
626   /// about that, hence this explicit parameter.
627   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
628                                              const Instruction *EntryVal,
629                                              Value *VectorLoopValue,
630                                              unsigned Part,
631                                              unsigned Lane = UINT_MAX);
632 
633   /// Generate a shuffle sequence that will reverse the vector Vec.
634   virtual Value *reverseVector(Value *Vec);
635 
636   /// Returns (and creates if needed) the original loop trip count.
637   Value *getOrCreateTripCount(Loop *NewLoop);
638 
639   /// Returns (and creates if needed) the trip count of the widened loop.
640   Value *getOrCreateVectorTripCount(Loop *NewLoop);
641 
642   /// Returns a bitcasted value to the requested vector type.
643   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
644   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
645                                 const DataLayout &DL);
646 
647   /// Emit a bypass check to see if the vector trip count is zero, including if
648   /// it overflows.
649   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
650 
651   /// Emit a bypass check to see if all of the SCEV assumptions we've
652   /// had to make are correct.
653   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
654 
655   /// Emit bypass checks to check any memory assumptions we may have made.
656   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
657 
658   /// Compute the transformed value of Index at offset StartValue using step
659   /// StepValue.
660   /// For integer induction, returns StartValue + Index * StepValue.
661   /// For pointer induction, returns StartValue[Index * StepValue].
662   /// FIXME: The newly created binary instructions should contain nsw/nuw
663   /// flags, which can be found from the original scalar operations.
664   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
665                               const DataLayout &DL,
666                               const InductionDescriptor &ID) const;
667 
668   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
669   /// vector loop preheader, middle block and scalar preheader. Also
670   /// allocate a loop object for the new vector loop and return it.
671   Loop *createVectorLoopSkeleton(StringRef Prefix);
672 
673   /// Create new phi nodes for the induction variables to resume iteration count
674   /// in the scalar epilogue, from where the vectorized loop left off (given by
675   /// \p VectorTripCount).
676   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
677 
678   /// Complete the loop skeleton by adding debug MDs, creating appropriate
679   /// conditional branches in the middle block, preparing the builder and
680   /// running the verifier. Take in the vector loop \p L as argument, and return
681   /// the preheader of the completed vector loop.
682   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
683 
684   /// Add additional metadata to \p To that was not present on \p Orig.
685   ///
686   /// Currently this is used to add the noalias annotations based on the
687   /// inserted memchecks.  Use this for instructions that are *cloned* into the
688   /// vector loop.
689   void addNewMetadata(Instruction *To, const Instruction *Orig);
690 
691   /// Add metadata from one instruction to another.
692   ///
693   /// This includes both the original MDs from \p From and additional ones (\see
694   /// addNewMetadata).  Use this for *newly created* instructions in the vector
695   /// loop.
696   void addMetadata(Instruction *To, Instruction *From);
697 
698   /// Similar to the previous function but it adds the metadata to a
699   /// vector of instructions.
700   void addMetadata(ArrayRef<Value *> To, Instruction *From);
701 
702   /// The original loop.
703   Loop *OrigLoop;
704 
705   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
706   /// dynamic knowledge to simplify SCEV expressions and converts them to a
707   /// more usable form.
708   PredicatedScalarEvolution &PSE;
709 
710   /// Loop Info.
711   LoopInfo *LI;
712 
713   /// Dominator Tree.
714   DominatorTree *DT;
715 
716   /// Alias Analysis.
717   AAResults *AA;
718 
719   /// Target Library Info.
720   const TargetLibraryInfo *TLI;
721 
722   /// Target Transform Info.
723   const TargetTransformInfo *TTI;
724 
725   /// Assumption Cache.
726   AssumptionCache *AC;
727 
728   /// Interface to emit optimization remarks.
729   OptimizationRemarkEmitter *ORE;
730 
731   /// LoopVersioning.  It's only set up (non-null) if memchecks were
732   /// used.
733   ///
734   /// This is currently only used to add no-alias metadata based on the
735   /// memchecks.  The actual versioning is performed manually.
736   std::unique_ptr<LoopVersioning> LVer;
737 
738   /// The vectorization SIMD factor to use. Each vector will have this many
739   /// vector elements.
740   unsigned VF;
741 
742   /// The vectorization unroll factor to use. Each scalar is vectorized to this
743   /// many different vector instructions.
744   unsigned UF;
745 
746   /// The builder that we use
747   IRBuilder<> Builder;
748 
749   // --- Vectorization state ---
750 
751   /// The vector-loop preheader.
752   BasicBlock *LoopVectorPreHeader;
753 
754   /// The scalar-loop preheader.
755   BasicBlock *LoopScalarPreHeader;
756 
757   /// Middle Block between the vector and the scalar.
758   BasicBlock *LoopMiddleBlock;
759 
760   /// The ExitBlock of the scalar loop.
761   BasicBlock *LoopExitBlock;
762 
763   /// The vector loop body.
764   BasicBlock *LoopVectorBody;
765 
766   /// The scalar loop body.
767   BasicBlock *LoopScalarBody;
768 
769   /// A list of all bypass blocks. The first block is the entry of the loop.
770   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
771 
772   /// The new Induction variable which was added to the new block.
773   PHINode *Induction = nullptr;
774 
775   /// The induction variable of the old basic block.
776   PHINode *OldInduction = nullptr;
777 
778   /// Maps values from the original loop to their corresponding values in the
779   /// vectorized loop. A key value can map to either vector values, scalar
780   /// values or both kinds of values, depending on whether the key was
781   /// vectorized and scalarized.
782   VectorizerValueMap VectorLoopValueMap;
783 
784   /// Store instructions that were predicated.
785   SmallVector<Instruction *, 4> PredicatedInstructions;
786 
787   /// Trip count of the original loop.
788   Value *TripCount = nullptr;
789 
790   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
791   Value *VectorTripCount = nullptr;
792 
793   /// The legality analysis.
794   LoopVectorizationLegality *Legal;
795 
796   /// The profitability analysis.
797   LoopVectorizationCostModel *Cost;
798 
799   // Record whether runtime checks are added.
800   bool AddedSafetyChecks = false;
801 
802   // Holds the end values for each induction variable. We save the end values
803   // so we can later fix up the external users of the induction variables.
804   DenseMap<PHINode *, Value *> IVEndValues;
805 
806   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
807   // fixed up at the end of vector code generation.
808   SmallVector<PHINode *, 8> OrigPHIsToFix;
809 
810   /// BFI and PSI are used to check for profile guided size optimizations.
811   BlockFrequencyInfo *BFI;
812   ProfileSummaryInfo *PSI;
813 
814   // Whether this loop should be optimized for size based on profile-guided
815   // size optimizations.
816   bool OptForSizeBasedOnProfile;
817 };
818 
819 class InnerLoopUnroller : public InnerLoopVectorizer {
820 public:
821   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
822                     LoopInfo *LI, DominatorTree *DT,
823                     const TargetLibraryInfo *TLI,
824                     const TargetTransformInfo *TTI, AssumptionCache *AC,
825                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
826                     LoopVectorizationLegality *LVL,
827                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
828                     ProfileSummaryInfo *PSI)
829       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
830                             UnrollFactor, LVL, CM, BFI, PSI) {}
831 
832 private:
833   Value *getBroadcastInstrs(Value *V) override;
834   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
835                        Instruction::BinaryOps Opcode =
836                        Instruction::BinaryOpsEnd) override;
837   Value *reverseVector(Value *Vec) override;
838 };
839 
840 } // end namespace llvm
841 
842 /// Look for a meaningful debug location on the instruction or its
843 /// operands.
844 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
845   if (!I)
846     return I;
847 
848   DebugLoc Empty;
849   if (I->getDebugLoc() != Empty)
850     return I;
851 
852   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
853     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
854       if (OpInst->getDebugLoc() != Empty)
855         return OpInst;
856   }
857 
858   return I;
859 }
860 
861 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
862   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
863     const DILocation *DIL = Inst->getDebugLoc();
864     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
865         !isa<DbgInfoIntrinsic>(Inst)) {
866       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
867       if (NewDIL)
868         B.SetCurrentDebugLocation(NewDIL.getValue());
869       else
870         LLVM_DEBUG(dbgs()
871                    << "Failed to create new discriminator: "
872                    << DIL->getFilename() << " Line: " << DIL->getLine());
873     }
874     else
875       B.SetCurrentDebugLocation(DIL);
876   } else
877     B.SetCurrentDebugLocation(DebugLoc());
878 }
879 
880 /// Write a record \p DebugMsg about vectorization failure to the debug
881 /// output stream. If \p I is passed, it is an instruction that prevents
882 /// vectorization.
883 #ifndef NDEBUG
884 static void debugVectorizationFailure(const StringRef DebugMsg,
885     Instruction *I) {
886   dbgs() << "LV: Not vectorizing: " << DebugMsg;
887   if (I != nullptr)
888     dbgs() << " " << *I;
889   else
890     dbgs() << '.';
891   dbgs() << '\n';
892 }
893 #endif
894 
895 /// Create an analysis remark that explains why vectorization failed
896 ///
897 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
898 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
899 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
900 /// the location of the remark.  \return the remark object that can be
901 /// streamed to.
902 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
903     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
904   Value *CodeRegion = TheLoop->getHeader();
905   DebugLoc DL = TheLoop->getStartLoc();
906 
907   if (I) {
908     CodeRegion = I->getParent();
909     // If there is no debug location attached to the instruction, fall back to
910     // using the loop's location.
911     if (I->getDebugLoc())
912       DL = I->getDebugLoc();
913   }
914 
915   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
916   R << "loop not vectorized: ";
917   return R;
918 }
919 
920 namespace llvm {
921 
922 void reportVectorizationFailure(const StringRef DebugMsg,
923     const StringRef OREMsg, const StringRef ORETag,
924     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
925   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
926   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
927   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
928                 ORETag, TheLoop, I) << OREMsg);
929 }
930 
931 } // end namespace llvm
932 
933 #ifndef NDEBUG
934 /// \return string containing a file name and a line # for the given loop.
935 static std::string getDebugLocString(const Loop *L) {
936   std::string Result;
937   if (L) {
938     raw_string_ostream OS(Result);
939     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
940       LoopDbgLoc.print(OS);
941     else
942       // Just print the module name.
943       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
944     OS.flush();
945   }
946   return Result;
947 }
948 #endif
949 
950 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
951                                          const Instruction *Orig) {
952   // If the loop was versioned with memchecks, add the corresponding no-alias
953   // metadata.
954   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
955     LVer->annotateInstWithNoAlias(To, Orig);
956 }
957 
958 void InnerLoopVectorizer::addMetadata(Instruction *To,
959                                       Instruction *From) {
960   propagateMetadata(To, From);
961   addNewMetadata(To, From);
962 }
963 
964 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
965                                       Instruction *From) {
966   for (Value *V : To) {
967     if (Instruction *I = dyn_cast<Instruction>(V))
968       addMetadata(I, From);
969   }
970 }
971 
972 namespace llvm {
973 
974 // Hints to the loop vectorization cost model about how the scalar epilogue
975 // loop should be lowered.
976 enum ScalarEpilogueLowering {
977 
978   // The default: allowing scalar epilogues.
979   CM_ScalarEpilogueAllowed,
980 
981   // Vectorization with OptForSize: don't allow epilogues.
982   CM_ScalarEpilogueNotAllowedOptSize,
983 
984   // A special case of vectorization with OptForSize: loops with a very small
985   // trip count are considered for vectorization under OptForSize, thereby
986   // making sure the cost of their loop body is dominant, free of runtime
987   // guards and scalar iteration overheads.
988   CM_ScalarEpilogueNotAllowedLowTripLoop,
989 
990   // Loop hint predicate indicating an epilogue is undesired.
991   CM_ScalarEpilogueNotNeededUsePredicate
992 };
993 
994 /// LoopVectorizationCostModel - estimates the expected speedups due to
995 /// vectorization.
996 /// In many cases vectorization is not profitable. This can happen for a
997 /// number of reasons. In this class we mainly attempt to predict the
998 /// expected speedup/slowdowns due to the supported instruction set. We use the
999 /// TargetTransformInfo to query the different backends for the cost of
1000 /// different operations.
1001 class LoopVectorizationCostModel {
1002 public:
1003   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1004                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1005                              LoopVectorizationLegality *Legal,
1006                              const TargetTransformInfo &TTI,
1007                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1008                              AssumptionCache *AC,
1009                              OptimizationRemarkEmitter *ORE, const Function *F,
1010                              const LoopVectorizeHints *Hints,
1011                              InterleavedAccessInfo &IAI)
1012       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1013         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1014         Hints(Hints), InterleaveInfo(IAI) {}
1015 
1016   /// \return An upper bound for the vectorization factor, or None if
1017   /// vectorization and interleaving should be avoided up front.
1018   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1019 
1020   /// \return True if runtime checks are required for vectorization, and false
1021   /// otherwise.
1022   bool runtimeChecksRequired();
1023 
1024   /// \return The most profitable vectorization factor and the cost of that VF.
1025   /// This method checks every power of two up to MaxVF. If UserVF is not zero,
1026   /// then that vectorization factor will be selected if vectorization is
1027   /// possible.
1028   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1029 
1030   /// Setup cost-based decisions for user vectorization factor.
1031   void selectUserVectorizationFactor(unsigned UserVF) {
1032     collectUniformsAndScalars(UserVF);
1033     collectInstsToScalarize(UserVF);
1034   }
1035 
1036   /// \return The size (in bits) of the smallest and widest types in the code
1037   /// that needs to be vectorized. We ignore values that remain scalar such as
1038   /// 64 bit loop indices.
1039   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1040 
1041   /// \return The desired interleave count.
1042   /// If interleave count has been specified by metadata it will be returned.
1043   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1044   /// are the selected vectorization factor and the cost of the selected VF.
1045   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1046 
1047   /// A memory access instruction may be vectorized in more than one way; the
1048   /// form of the instruction after vectorization depends on cost.
1049   /// This function makes cost-based decisions for Load/Store instructions
1050   /// and collects them in a map. This decision map is used for building
1051   /// the lists of loop-uniform and loop-scalar instructions.
1052   /// The calculated cost is saved with the widening decision in order to
1053   /// avoid redundant calculations.
1054   void setCostBasedWideningDecision(unsigned VF);
1055 
1056   /// A struct that represents some properties of the register usage
1057   /// of a loop.
1058   struct RegisterUsage {
1059     /// Holds the number of loop invariant values that are used in the loop.
1060     /// The key is ClassID of target-provided register class.
1061     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1062     /// Holds the maximum number of concurrent live intervals in the loop.
1063     /// The key is ClassID of target-provided register class.
1064     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1065   };
1066 
1067   /// \return Returns information about the register usages of the loop for the
1068   /// given vectorization factors.
1069   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1070 
1071   /// Collect values we want to ignore in the cost model.
1072   void collectValuesToIgnore();
1073 
1074   /// \returns The smallest bitwidth each instruction can be represented with.
1075   /// The vector equivalents of these instructions should be truncated to this
1076   /// type.
1077   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1078     return MinBWs;
1079   }
1080 
1081   /// \returns True if it is more profitable to scalarize instruction \p I for
1082   /// vectorization factor \p VF.
1083   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1084     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1085 
1086     // Cost model is not run in the VPlan-native path - return conservative
1087     // result until this changes.
1088     if (EnableVPlanNativePath)
1089       return false;
1090 
1091     auto Scalars = InstsToScalarize.find(VF);
1092     assert(Scalars != InstsToScalarize.end() &&
1093            "VF not yet analyzed for scalarization profitability");
1094     return Scalars->second.find(I) != Scalars->second.end();
1095   }
1096 
1097   /// Returns true if \p I is known to be uniform after vectorization.
1098   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1099     if (VF == 1)
1100       return true;
1101 
1102     // Cost model is not run in the VPlan-native path - return conservative
1103     // result until this changes.
1104     if (EnableVPlanNativePath)
1105       return false;
1106 
1107     auto UniformsPerVF = Uniforms.find(VF);
1108     assert(UniformsPerVF != Uniforms.end() &&
1109            "VF not yet analyzed for uniformity");
1110     return UniformsPerVF->second.count(I);
1111   }
1112 
1113   /// Returns true if \p I is known to be scalar after vectorization.
1114   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1115     if (VF == 1)
1116       return true;
1117 
1118     // Cost model is not run in the VPlan-native path - return conservative
1119     // result until this changes.
1120     if (EnableVPlanNativePath)
1121       return false;
1122 
1123     auto ScalarsPerVF = Scalars.find(VF);
1124     assert(ScalarsPerVF != Scalars.end() &&
1125            "Scalar values are not calculated for VF");
1126     return ScalarsPerVF->second.count(I);
1127   }
1128 
1129   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1130   /// for vectorization factor \p VF.
1131   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1132     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1133            !isProfitableToScalarize(I, VF) &&
1134            !isScalarAfterVectorization(I, VF);
1135   }
1136 
1137   /// Decision that was taken during cost calculation for memory instruction.
1138   enum InstWidening {
1139     CM_Unknown,
1140     CM_Widen,         // For consecutive accesses with stride +1.
1141     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1142     CM_Interleave,
1143     CM_GatherScatter,
1144     CM_Scalarize
1145   };
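  // As a sketch of the typical outcomes for a load of A[i]: a stride +1 access
  // maps to CM_Widen (one wide load), a stride -1 access to CM_Widen_Reverse
  // (wide load plus reverse shuffle), a member of an interleave group to
  // CM_Interleave, an arbitrary-stride access on targets with fast gathers to
  // CM_GatherScatter, and anything else to CM_Scalarize (VF scalar loads).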
1146 
1147   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1148   /// instruction \p I and vector width \p VF.
1149   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1150                            unsigned Cost) {
1151     assert(VF >= 2 && "Expected VF >=2");
1152     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1153   }
1154 
1155   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1156   /// interleaving group \p Grp and vector width \p VF.
1157   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1158                            InstWidening W, unsigned Cost) {
1159     assert(VF >= 2 && "Expected VF >=2");
1160     /// Broadcast this decision to all instructions inside the group.
1161     /// But the cost will be assigned to one instruction only.
1162     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1163       if (auto *I = Grp->getMember(i)) {
1164         if (Grp->getInsertPos() == I)
1165           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1166         else
1167           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1168       }
1169     }
1170   }
1171 
1172   /// Return the cost model decision for the given instruction \p I and vector
1173   /// width \p VF. Return CM_Unknown if this instruction did not pass
1174   /// through the cost modeling.
1175   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1176     assert(VF >= 2 && "Expected VF >=2");
1177 
1178     // Cost model is not run in the VPlan-native path - return conservative
1179     // result until this changes.
1180     if (EnableVPlanNativePath)
1181       return CM_GatherScatter;
1182 
1183     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1184     auto Itr = WideningDecisions.find(InstOnVF);
1185     if (Itr == WideningDecisions.end())
1186       return CM_Unknown;
1187     return Itr->second.first;
1188   }
1189 
1190   /// Return the vectorization cost for the given instruction \p I and vector
1191   /// width \p VF.
1192   unsigned getWideningCost(Instruction *I, unsigned VF) {
1193     assert(VF >= 2 && "Expected VF >=2");
1194     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1195     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1196            "The cost is not calculated");
1197     return WideningDecisions[InstOnVF].second;
1198   }
1199 
1200   /// Return True if instruction \p I is an optimizable truncate whose operand
1201   /// is an induction variable. Such a truncate will be removed by adding a new
1202   /// induction variable with the destination type.
1203   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1204     // If the instruction is not a truncate, return false.
1205     auto *Trunc = dyn_cast<TruncInst>(I);
1206     if (!Trunc)
1207       return false;
1208 
1209     // Get the source and destination types of the truncate.
1210     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1211     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1212 
1213     // If the truncate is free for the given types, return false. Replacing a
1214     // free truncate with an induction variable would add an induction variable
1215     // update instruction to each iteration of the loop. We exclude from this
1216     // check the primary induction variable since it will need an update
1217     // instruction regardless.
1218     Value *Op = Trunc->getOperand(0);
1219     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1220       return false;
1221 
1222     // If the truncated value is not an induction variable, return false.
1223     return Legal->isInductionPhi(Op);
1224   }
1225 
1226   /// Collects the instructions to scalarize for each predicated instruction in
1227   /// the loop.
1228   void collectInstsToScalarize(unsigned VF);
1229 
  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
1233   void collectUniformsAndScalars(unsigned VF) {
1234     // Do the analysis once.
1235     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1236       return;
1237     setCostBasedWideningDecision(VF);
1238     collectLoopUniforms(VF);
1239     collectLoopScalars(VF);
1240   }
1241 
1242   /// Returns true if the target machine supports masked store operation
1243   /// for the given \p DataType and kind of access to \p Ptr.
1244   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1245     return Legal->isConsecutivePtr(Ptr) &&
1246            TTI.isLegalMaskedStore(DataType, Alignment);
1247   }
1248 
1249   /// Returns true if the target machine supports masked load operation
1250   /// for the given \p DataType and kind of access to \p Ptr.
1251   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1252     return Legal->isConsecutivePtr(Ptr) &&
1253            TTI.isLegalMaskedLoad(DataType, Alignment);
1254   }
1255 
1256   /// Returns true if the target machine supports masked scatter operation
1257   /// for the given \p DataType.
1258   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1259     return TTI.isLegalMaskedScatter(DataType, Alignment);
1260   }
1261 
1262   /// Returns true if the target machine supports masked gather operation
1263   /// for the given \p DataType.
1264   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1265     return TTI.isLegalMaskedGather(DataType, Alignment);
1266   }
1267 
1268   /// Returns true if the target machine can represent \p V as a masked gather
1269   /// or scatter operation.
1270   bool isLegalGatherOrScatter(Value *V) {
1271     bool LI = isa<LoadInst>(V);
1272     bool SI = isa<StoreInst>(V);
1273     if (!LI && !SI)
1274       return false;
1275     auto *Ty = getMemInstValueType(V);
1276     Align Align = getLoadStoreAlignment(V);
1277     return (LI && isLegalMaskedGather(Ty, Align)) ||
1278            (SI && isLegalMaskedScatter(Ty, Align));
1279   }
1280 
1281   /// Returns true if \p I is an instruction that will be scalarized with
1282   /// predication. Such instructions include conditional stores and
1283   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1286   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1287 
1288   // Returns true if \p I is an instruction that will be predicated either
1289   // through scalar predication or masked load/store or masked gather/scatter.
1290   // Superset of instructions that return true for isScalarWithPredication.
1291   bool isPredicatedInst(Instruction *I) {
1292     if (!blockNeedsPredication(I->getParent()))
1293       return false;
1294     // Loads and stores that need some form of masked operation are predicated
1295     // instructions.
1296     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1297       return Legal->isMaskRequired(I);
1298     return isScalarWithPredication(I);
1299   }
1300 
1301   /// Returns true if \p I is a memory instruction with consecutive memory
1302   /// access that can be widened.
1303   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1304 
1305   /// Returns true if \p I is a memory instruction in an interleaved-group
1306   /// of memory accesses that can be vectorized with wide vector loads/stores
1307   /// and shuffles.
1308   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1309 
1310   /// Check if \p Instr belongs to any interleaved access group.
1311   bool isAccessInterleaved(Instruction *Instr) {
1312     return InterleaveInfo.isInterleaved(Instr);
1313   }
1314 
1315   /// Get the interleaved access group that \p Instr belongs to.
1316   const InterleaveGroup<Instruction> *
1317   getInterleavedAccessGroup(Instruction *Instr) {
1318     return InterleaveInfo.getInterleaveGroup(Instr);
1319   }
1320 
1321   /// Returns true if an interleaved group requires a scalar iteration
1322   /// to handle accesses with gaps, and there is nothing preventing us from
1323   /// creating a scalar epilogue.
1324   bool requiresScalarEpilogue() const {
1325     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1326   }
1327 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1330   bool isScalarEpilogueAllowed() const {
1331     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1332   }
1333 
  /// Returns true if all loop blocks should be masked to fold the tail loop.
1335   bool foldTailByMasking() const { return FoldTailByMasking; }
1336 
1337   bool blockNeedsPredication(BasicBlock *BB) {
1338     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1339   }
1340 
1341   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1342   /// with factor VF.  Return the cost of the instruction, including
1343   /// scalarization overhead if it's needed.
1344   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1345 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
1351   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1352 
1353   /// Invalidates decisions already taken by the cost model.
1354   void invalidateCostModelingDecisions() {
1355     WideningDecisions.clear();
1356     Uniforms.clear();
1357     Scalars.clear();
1358   }
1359 
1360 private:
1361   unsigned NumPredStores = 0;
1362 
1363   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1364   /// than zero. One is returned if vectorization should best be avoided due
1365   /// to cost.
1366   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1367 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1375   using VectorizationCostTy = std::pair<unsigned, bool>;
1376 
1377   /// Returns the expected execution cost. The unit of the cost does
1378   /// not matter because we use the 'cost' units to compare different
1379   /// vector widths. The cost that is returned is *not* normalized by
1380   /// the factor width.
1381   VectorizationCostTy expectedCost(unsigned VF);
1382 
1383   /// Returns the execution time cost of an instruction for a given vector
1384   /// width. Vector width of one means scalar.
1385   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1386 
1387   /// The cost-computation logic from getInstructionCost which provides
1388   /// the vector type as an output parameter.
1389   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1390 
1391   /// Calculate vectorization cost of memory instruction \p I.
1392   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1393 
1394   /// The cost computation for scalarized memory instruction.
1395   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1396 
1397   /// The cost computation for interleaving group of memory instructions.
1398   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1399 
1400   /// The cost computation for Gather/Scatter instruction.
1401   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1402 
1403   /// The cost computation for widening instruction \p I with consecutive
1404   /// memory access.
1405   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1406 
1407   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1408   /// Load: scalar load + broadcast.
1409   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1410   /// element)
1411   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1412 
1413   /// Estimate the overhead of scalarizing an instruction. This is a
1414   /// convenience wrapper for the type-based getScalarizationOverhead API.
1415   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1416 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1419   bool isConsecutiveLoadOrStore(Instruction *I);
1420 
1421   /// Returns true if an artificially high cost for emulated masked memrefs
1422   /// should be used.
1423   bool useEmulatedMaskMemRefHack(Instruction *I);
1424 
1425   /// Map of scalar integer values to the smallest bitwidth they can be legally
1426   /// represented as. The vector equivalents of these values should be truncated
1427   /// to this type.
1428   MapVector<Instruction *, uint64_t> MinBWs;
1429 
1430   /// A type representing the costs for instructions if they were to be
1431   /// scalarized rather than vectorized. The entries are Instruction-Cost
1432   /// pairs.
1433   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1434 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1437   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1438 
1439   /// Records whether it is allowed to have the original scalar loop execute at
1440   /// least once. This may be needed as a fallback loop in case runtime
1441   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1443   /// or as a peel-loop to handle gaps in interleave-groups.
1444   /// Under optsize and when the trip count is very small we don't allow any
1445   /// iterations to execute in the scalar loop.
1446   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1447 
1448   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1449   bool FoldTailByMasking = false;
1450 
1451   /// A map holding scalar costs for different vectorization factors. The
1452   /// presence of a cost for an instruction in the mapping indicates that the
1453   /// instruction will be scalarized when vectorizing with the associated
1454   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1455   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1456 
1457   /// Holds the instructions known to be uniform after vectorization.
1458   /// The data is collected per VF.
1459   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1460 
1461   /// Holds the instructions known to be scalar after vectorization.
1462   /// The data is collected per VF.
1463   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1464 
1465   /// Holds the instructions (address computations) that are forced to be
1466   /// scalarized.
1467   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1468 
1469   /// Returns the expected difference in cost from scalarizing the expression
1470   /// feeding a predicated instruction \p PredInst. The instructions to
1471   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1472   /// non-negative return value implies the expression will be scalarized.
1473   /// Currently, only single-use chains are considered for scalarization.
1474   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1475                               unsigned VF);
1476 
1477   /// Collect the instructions that are uniform after vectorization. An
1478   /// instruction is uniform if we represent it with a single scalar value in
1479   /// the vectorized loop corresponding to each vector iteration. Examples of
1480   /// uniform instructions include pointer operands of consecutive or
1481   /// interleaved memory accesses. Note that although uniformity implies an
1482   /// instruction will be scalar, the reverse is not true. In general, a
1483   /// scalarized instruction will be represented by VF scalar values in the
1484   /// vectorized loop, each corresponding to an iteration of the original
1485   /// scalar loop.
1486   void collectLoopUniforms(unsigned VF);
1487 
1488   /// Collect the instructions that are scalar after vectorization. An
1489   /// instruction is scalar if it is known to be uniform or will be scalarized
1490   /// during vectorization. Non-uniform scalarized instructions will be
1491   /// represented by VF values in the vectorized loop, each corresponding to an
1492   /// iteration of the original scalar loop.
1493   void collectLoopScalars(unsigned VF);
1494 
1495   /// Keeps cost model vectorization decision and cost for instructions.
1496   /// Right now it is used for memory instructions only.
1497   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1498                                 std::pair<InstWidening, unsigned>>;
1499 
1500   DecisionList WideningDecisions;
1501 
1502   /// Returns true if \p V is expected to be vectorized and it needs to be
1503   /// extracted.
1504   bool needsExtract(Value *V, unsigned VF) const {
1505     Instruction *I = dyn_cast<Instruction>(V);
1506     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1507       return false;
1508 
1509     // Assume we can vectorize V (and hence we need extraction) if the
1510     // scalars are not computed yet. This can happen, because it is called
1511     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1512     // the scalars are collected. That should be a safe assumption in most
1513     // cases, because we check if the operands have vectorizable types
1514     // beforehand in LoopVectorizationLegality.
1515     return Scalars.find(VF) == Scalars.end() ||
1516            !isScalarAfterVectorization(I, VF);
  }
1518 
1519   /// Returns a range containing only operands needing to be extracted.
1520   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1521                                                    unsigned VF) {
1522     return SmallVector<Value *, 4>(make_filter_range(
1523         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1524   }
1525 
1526 public:
1527   /// The loop that we evaluate.
1528   Loop *TheLoop;
1529 
1530   /// Predicated scalar evolution analysis.
1531   PredicatedScalarEvolution &PSE;
1532 
1533   /// Loop Info analysis.
1534   LoopInfo *LI;
1535 
1536   /// Vectorization legality.
1537   LoopVectorizationLegality *Legal;
1538 
1539   /// Vector target information.
1540   const TargetTransformInfo &TTI;
1541 
1542   /// Target Library Info.
1543   const TargetLibraryInfo *TLI;
1544 
1545   /// Demanded bits analysis.
1546   DemandedBits *DB;
1547 
1548   /// Assumption cache.
1549   AssumptionCache *AC;
1550 
1551   /// Interface to emit optimization remarks.
1552   OptimizationRemarkEmitter *ORE;
1553 
1554   const Function *TheFunction;
1555 
1556   /// Loop Vectorize Hint.
1557   const LoopVectorizeHints *Hints;
1558 
  /// The interleave access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
1561   InterleavedAccessInfo &InterleaveInfo;
1562 
1563   /// Values to ignore in the cost model.
1564   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1565 
1566   /// Values to ignore in the cost model when VF > 1.
1567   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1568 };
1569 
1570 } // end namespace llvm
1571 
1572 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1573 // vectorization. The loop needs to be annotated with #pragma omp simd
1574 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1575 // vector length information is not provided, vectorization is not considered
1576 // explicit. Interleave hints are not allowed either. These limitations will be
1577 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1579 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1580 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1581 // provides *explicit vectorization hints* (LV can bypass legal checks and
1582 // assume that vectorization is legal). However, both hints are implemented
1583 // using the same metadata (llvm.loop.vectorize, processed by
1584 // LoopVectorizeHints). This will be fixed in the future when the native IR
1585 // representation for pragma 'omp simd' is introduced.
1586 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1587                                    OptimizationRemarkEmitter *ORE) {
1588   assert(!OuterLp->empty() && "This is not an outer loop");
1589   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1590 
1591   // Only outer loops with an explicit vectorization hint are supported.
1592   // Unannotated outer loops are ignored.
1593   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1594     return false;
1595 
1596   Function *Fn = OuterLp->getHeader()->getParent();
1597   if (!Hints.allowVectorization(Fn, OuterLp,
1598                                 true /*VectorizeOnlyWhenForced*/)) {
1599     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1600     return false;
1601   }
1602 
1603   if (Hints.getInterleave() > 1) {
1604     // TODO: Interleave support is future work.
1605     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1606                          "outer loops.\n");
1607     Hints.emitRemarkWithHints();
1608     return false;
1609   }
1610 
1611   return true;
1612 }
1613 
1614 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1615                                   OptimizationRemarkEmitter *ORE,
1616                                   SmallVectorImpl<Loop *> &V) {
1617   // Collect inner loops and outer loops without irreducible control flow. For
1618   // now, only collect outer loops that have explicit vectorization hints. If we
1619   // are stress testing the VPlan H-CFG construction, we collect the outermost
1620   // loop of every loop nest.
1621   if (L.empty() || VPlanBuildStressTest ||
1622       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1623     LoopBlocksRPO RPOT(&L);
1624     RPOT.perform(LI);
1625     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1626       V.push_back(&L);
1627       // TODO: Collect inner loops inside marked outer loops in case
1628       // vectorization fails for the outer loop. Do not invoke
1629       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1630       // already known to be reducible. We can use an inherited attribute for
1631       // that.
1632       return;
1633     }
1634   }
1635   for (Loop *InnerL : L)
1636     collectSupportedLoops(*InnerL, LI, ORE, V);
1637 }
1638 
1639 namespace {
1640 
1641 /// The LoopVectorize Pass.
1642 struct LoopVectorize : public FunctionPass {
1643   /// Pass identification, replacement for typeid
1644   static char ID;
1645 
1646   LoopVectorizePass Impl;
1647 
1648   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1649                          bool VectorizeOnlyWhenForced = false)
1650       : FunctionPass(ID),
1651         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1652     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1653   }
1654 
1655   bool runOnFunction(Function &F) override {
1656     if (skipFunction(F))
1657       return false;
1658 
1659     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1660     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1661     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1662     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1663     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1664     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1665     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1666     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1667     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1668     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1669     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1670     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1671     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1672 
1673     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1674         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1675 
1676     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1677                         GetLAA, *ORE, PSI).MadeAnyChange;
1678   }
1679 
1680   void getAnalysisUsage(AnalysisUsage &AU) const override {
1681     AU.addRequired<AssumptionCacheTracker>();
1682     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1683     AU.addRequired<DominatorTreeWrapperPass>();
1684     AU.addRequired<LoopInfoWrapperPass>();
1685     AU.addRequired<ScalarEvolutionWrapperPass>();
1686     AU.addRequired<TargetTransformInfoWrapperPass>();
1687     AU.addRequired<AAResultsWrapperPass>();
1688     AU.addRequired<LoopAccessLegacyAnalysis>();
1689     AU.addRequired<DemandedBitsWrapperPass>();
1690     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1691     AU.addRequired<InjectTLIMappingsLegacy>();
1692 
    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1696     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1697     if (!EnableVPlanNativePath) {
1698       AU.addPreserved<LoopInfoWrapperPass>();
1699       AU.addPreserved<DominatorTreeWrapperPass>();
1700     }
1701 
1702     AU.addPreserved<BasicAAWrapperPass>();
1703     AU.addPreserved<GlobalsAAWrapperPass>();
1704     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1705   }
1706 };
1707 
1708 } // end anonymous namespace
1709 
1710 //===----------------------------------------------------------------------===//
1711 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1712 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1713 //===----------------------------------------------------------------------===//
1714 
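// For example, broadcasting a loop-invariant i32 value %x with VF = 4 yields a
// <4 x i32> splat of %x (typically an insertelement plus a shufflevector),
// emitted in the new preheader when hoisting is provably safe.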
1715 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // emitted inside the vector loop body.
1719   Instruction *Instr = dyn_cast<Instruction>(V);
1720   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1721                      (!Instr ||
1722                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1723   // Place the code for broadcasting invariant variables in the new preheader.
1724   IRBuilder<>::InsertPointGuard Guard(Builder);
1725   if (SafeToHoist)
1726     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1727 
1728   // Broadcast the scalar into all locations in the vector.
1729   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1730 
1731   return Shuf;
1732 }
1733 
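// Illustrative result for an i32 induction {0,+,1} with VF = 4 and UF = 1
// (block names are indicative):
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>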
1734 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1735     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1736   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1737          "Expected either an induction phi-node or a truncate of it!");
1738   Value *Start = II.getStartValue();
1739 
1740   // Construct the initial value of the vector IV in the vector loop preheader
1741   auto CurrIP = Builder.saveIP();
1742   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1743   if (isa<TruncInst>(EntryVal)) {
1744     assert(Start->getType()->isIntegerTy() &&
1745            "Truncation requires an integer type");
1746     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1747     Step = Builder.CreateTrunc(Step, TruncType);
1748     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1749   }
1750   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1751   Value *SteppedStart =
1752       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1753 
1754   // We create vector phi nodes for both integer and floating-point induction
1755   // variables. Here, we determine the kind of arithmetic we will perform.
1756   Instruction::BinaryOps AddOp;
1757   Instruction::BinaryOps MulOp;
1758   if (Step->getType()->isIntegerTy()) {
1759     AddOp = Instruction::Add;
1760     MulOp = Instruction::Mul;
1761   } else {
1762     AddOp = II.getInductionOpcode();
1763     MulOp = Instruction::FMul;
1764   }
1765 
1766   // Multiply the vectorization factor by the step using integer or
1767   // floating-point arithmetic as appropriate.
1768   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1769   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1770 
1771   // Create a vector splat to use in the induction update.
1772   //
1773   // FIXME: If the step is non-constant, we create the vector splat with
1774   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1775   //        handle a constant vector splat.
1776   Value *SplatVF =
1777       isa<Constant>(Mul)
1778           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1779           : Builder.CreateVectorSplat(VF, Mul);
1780   Builder.restoreIP(CurrIP);
1781 
1782   // We may need to add the step a number of times, depending on the unroll
1783   // factor. The last of those goes into the PHI.
1784   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1785                                     &*LoopVectorBody->getFirstInsertionPt());
1786   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1787   Instruction *LastInduction = VecInd;
1788   for (unsigned Part = 0; Part < UF; ++Part) {
1789     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1790 
1791     if (isa<TruncInst>(EntryVal))
1792       addMetadata(LastInduction, EntryVal);
1793     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1794 
1795     LastInduction = cast<Instruction>(addFastMathFlag(
1796         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1797     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1798   }
1799 
1800   // Move the last step to the end of the latch block. This ensures consistent
1801   // placement of all induction updates.
1802   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1803   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1804   auto *ICmp = cast<Instruction>(Br->getCondition());
1805   LastInduction->moveBefore(ICmp);
1806   LastInduction->setName("vec.ind.next");
1807 
1808   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1809   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1810 }
1811 
1812 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1813   return Cost->isScalarAfterVectorization(I, VF) ||
1814          Cost->isProfitableToScalarize(I, VF);
1815 }
1816 
1817 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1818   if (shouldScalarizeInstruction(IV))
1819     return true;
1820   auto isScalarInst = [&](User *U) -> bool {
1821     auto *I = cast<Instruction>(U);
1822     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1823   };
1824   return llvm::any_of(IV->users(), isScalarInst);
1825 }
1826 
1827 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1828     const InductionDescriptor &ID, const Instruction *EntryVal,
1829     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1830   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1831          "Expected either an induction phi-node or a truncate of it!");
1832 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1839   if (isa<TruncInst>(EntryVal))
1840     return;
1841 
1842   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1843   if (Casts.empty())
1844     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any exist) have no uses outside the
  // induction update chain itself.
1848   Instruction *CastInst = *Casts.begin();
1849   if (Lane < UINT_MAX)
1850     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1851   else
1852     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1853 }
1854 
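// Rough summary of the cases handled below: for VF <= 1 the scalar IV is
// broadcast and stepped per unroll part; otherwise we create a vector IV
// (createVectorIntOrFpInductionPHI), scalar steps (buildScalarSteps), or both,
// depending on whether the IV itself or any of its in-loop users must remain
// scalar after vectorization.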
1855 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1856   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1857          "Primary induction variable must have an integer type");
1858 
1859   auto II = Legal->getInductionVars().find(IV);
1860   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1861 
1862   auto ID = II->second;
1863   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1864 
1865   // The value from the original loop to which we are mapping the new induction
1866   // variable.
1867   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1868 
1869   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1870 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1873   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1874     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1875            "Induction step should be loop invariant");
1876     if (PSE.getSE()->isSCEVable(IV->getType())) {
1877       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1878       return Exp.expandCodeFor(Step, Step->getType(),
1879                                LoopVectorPreHeader->getTerminator());
1880     }
1881     return cast<SCEVUnknown>(Step)->getValue();
1882   };
1883 
1884   // The scalar value to broadcast. This is derived from the canonical
1885   // induction variable. If a truncation type is given, truncate the canonical
1886   // induction variable and step. Otherwise, derive these values from the
1887   // induction descriptor.
1888   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1889     Value *ScalarIV = Induction;
1890     if (IV != OldInduction) {
1891       ScalarIV = IV->getType()->isIntegerTy()
1892                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1893                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1894                                           IV->getType());
1895       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1896       ScalarIV->setName("offset.idx");
1897     }
1898     if (Trunc) {
1899       auto *TruncType = cast<IntegerType>(Trunc->getType());
1900       assert(Step->getType()->isIntegerTy() &&
1901              "Truncation requires an integer step");
1902       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1903       Step = Builder.CreateTrunc(Step, TruncType);
1904     }
1905     return ScalarIV;
1906   };
1907 
  // Create the vector values from the scalar IV when we are not creating a
  // vector IV.
1910   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1911     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1912     for (unsigned Part = 0; Part < UF; ++Part) {
1913       Value *EntryPart =
1914           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1915       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1916       if (Trunc)
1917         addMetadata(EntryPart, Trunc);
1918       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1919     }
1920   };
1921 
1922   // Now do the actual transformations, and start with creating the step value.
1923   Value *Step = CreateStepValue(ID.getStep());
1924   if (VF <= 1) {
1925     Value *ScalarIV = CreateScalarIV(Step);
1926     CreateSplatIV(ScalarIV, Step);
1927     return;
1928   }
1929 
1930   // Determine if we want a scalar version of the induction variable. This is
1931   // true if the induction variable itself is not widened, or if it has at
1932   // least one user in the loop that is not widened.
1933   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1934   if (!NeedsScalarIV) {
1935     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1936     return;
1937   }
1938 
1939   // Try to create a new independent vector induction variable. If we can't
1940   // create the phi node, we will splat the scalar induction variable in each
1941   // loop iteration.
1942   if (!shouldScalarizeInstruction(EntryVal)) {
1943     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1944     Value *ScalarIV = CreateScalarIV(Step);
1945     // Create scalar steps that can be used by instructions we will later
1946     // scalarize. Note that the addition of the scalar steps will not increase
1947     // the number of instructions in the loop in the common case prior to
1948     // InstCombine. We will be trading one vector extract for each scalar step.
1949     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1950     return;
1951   }
1952 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV - except when we tail-fold, in which case the splat IV feeds
  // the predicate used by the masked loads/stores.
1956   Value *ScalarIV = CreateScalarIV(Step);
1957   if (!Cost->isScalarEpilogueAllowed())
1958     CreateSplatIV(ScalarIV, Step);
1959   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1960 }
1961 
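// For example (integer case), with Val = <%x, %x, %x, %x>, StartIdx = 0 and
// Step = 1, this produces the step vector <%x, %x + 1, %x + 2, %x + 3>.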
1962 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1963                                           Instruction::BinaryOps BinOp) {
1964   // Create and check the types.
1965   auto *ValVTy = cast<VectorType>(Val->getType());
1966   int VLen = ValVTy->getNumElements();
1967 
1968   Type *STy = Val->getType()->getScalarType();
1969   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1970          "Induction Step must be an integer or FP");
1971   assert(Step->getType() == STy && "Step has wrong type");
1972 
1973   SmallVector<Constant *, 8> Indices;
1974 
1975   if (STy->isIntegerTy()) {
    // Create a vector of VLen consecutive numbers starting at StartIdx.
1977     for (int i = 0; i < VLen; ++i)
1978       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1979 
1980     // Add the consecutive indices to the vector value.
1981     Constant *Cv = ConstantVector::get(Indices);
1982     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1983     Step = Builder.CreateVectorSplat(VLen, Step);
1984     assert(Step->getType() == Val->getType() && "Invalid step vec");
1985     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1986     // which can be found from the original scalar operations.
1987     Step = Builder.CreateMul(Cv, Step);
1988     return Builder.CreateAdd(Val, Step, "induction");
1989   }
1990 
1991   // Floating point induction.
1992   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1993          "Binary Opcode should be specified for FP induction");
  // Create a vector of VLen consecutive numbers starting at StartIdx.
1995   for (int i = 0; i < VLen; ++i)
1996     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1997 
1998   // Add the consecutive indices to the vector value.
1999   Constant *Cv = ConstantVector::get(Indices);
2000 
2001   Step = Builder.CreateVectorSplat(VLen, Step);
2002 
2003   // Floating point operations had to be 'fast' to enable the induction.
2004   FastMathFlags Flags;
2005   Flags.setFast();
2006 
2007   Value *MulOp = Builder.CreateFMul(Cv, Step);
2008   if (isa<Instruction>(MulOp))
2009     // Have to check, MulOp may be a constant
2010     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2011 
2012   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2013   if (isa<Instruction>(BOp))
2014     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2015   return BOp;
2016 }
2017 
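// For example, with VF = 4, UF = 2, ScalarIV = %iv and Step = 1, this emits
// the scalar values %iv + 0 .. %iv + 3 for part 0 and %iv + 4 .. %iv + 7 for
// part 1 (only lane 0 of each part if EntryVal is uniform after
// vectorization).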
2018 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2019                                            Instruction *EntryVal,
2020                                            const InductionDescriptor &ID) {
2021   // We shouldn't have to build scalar steps if we aren't vectorizing.
2022   assert(VF > 1 && "VF should be greater than one");
2023 
2024   // Get the value type and ensure it and the step have the same integer type.
2025   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2026   assert(ScalarIVTy == Step->getType() &&
2027          "Val and Step should have the same type");
2028 
2029   // We build scalar steps for both integer and floating-point induction
2030   // variables. Here, we determine the kind of arithmetic we will perform.
2031   Instruction::BinaryOps AddOp;
2032   Instruction::BinaryOps MulOp;
2033   if (ScalarIVTy->isIntegerTy()) {
2034     AddOp = Instruction::Add;
2035     MulOp = Instruction::Mul;
2036   } else {
2037     AddOp = ID.getInductionOpcode();
2038     MulOp = Instruction::FMul;
2039   }
2040 
2041   // Determine the number of scalars we need to generate for each unroll
2042   // iteration. If EntryVal is uniform, we only need to generate the first
2043   // lane. Otherwise, we generate all VF values.
2044   unsigned Lanes =
2045       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2046                                                                          : VF;
2047   // Compute the scalar steps and save the results in VectorLoopValueMap.
2048   for (unsigned Part = 0; Part < UF; ++Part) {
2049     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2050       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2051       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2052       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2053       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2054       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2055     }
2056   }
2057 }
2058 
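// For example, if a value %x was scalarized with VF = 4 and is later needed in
// vector form, its lane values are packed into a <4 x Ty> vector with a chain
// of insertelement instructions (or %x is broadcast from lane 0 if it is
// uniform after vectorization).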
2059 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2060   assert(V != Induction && "The new induction variable should not be used.");
2061   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2062   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2063 
2064   // If we have a stride that is replaced by one, do it here. Defer this for
2065   // the VPlan-native path until we start running Legal checks in that path.
2066   if (!EnableVPlanNativePath && Legal->hasStride(V))
2067     V = ConstantInt::get(V->getType(), 1);
2068 
2069   // If we have a vector mapped to this value, return it.
2070   if (VectorLoopValueMap.hasVectorValue(V, Part))
2071     return VectorLoopValueMap.getVectorValue(V, Part);
2072 
2073   // If the value has not been vectorized, check if it has been scalarized
2074   // instead. If it has been scalarized, and we actually need the value in
2075   // vector form, we will construct the vector values on demand.
2076   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2077     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2078 
2079     // If we've scalarized a value, that value should be an instruction.
2080     auto *I = cast<Instruction>(V);
2081 
2082     // If we aren't vectorizing, we can just copy the scalar map values over to
2083     // the vector map.
2084     if (VF == 1) {
2085       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2086       return ScalarValue;
2087     }
2088 
2089     // Get the last scalar instruction we generated for V and Part. If the value
2090     // is known to be uniform after vectorization, this corresponds to lane zero
2091     // of the Part unroll iteration. Otherwise, the last instruction is the one
2092     // we created for the last vector lane of the Part unroll iteration.
2093     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2094     auto *LastInst = cast<Instruction>(
2095         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2096 
2097     // Set the insert point after the last scalarized instruction. This ensures
2098     // the insertelement sequence will directly follow the scalar definitions.
2099     auto OldIP = Builder.saveIP();
2100     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2101     Builder.SetInsertPoint(&*NewIP);
2102 
2103     // However, if we are vectorizing, we need to construct the vector values.
2104     // If the value is known to be uniform after vectorization, we can just
2105     // broadcast the scalar value corresponding to lane zero for each unroll
2106     // iteration. Otherwise, we construct the vector values using insertelement
2107     // instructions. Since the resulting vectors are stored in
2108     // VectorLoopValueMap, we will only generate the insertelements once.
2109     Value *VectorValue = nullptr;
2110     if (Cost->isUniformAfterVectorization(I, VF)) {
2111       VectorValue = getBroadcastInstrs(ScalarValue);
2112       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2113     } else {
2114       // Initialize packing with insertelements to start from undef.
2115       Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
2116       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2117       for (unsigned Lane = 0; Lane < VF; ++Lane)
2118         packScalarIntoVectorValue(V, {Part, Lane});
2119       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2120     }
2121     Builder.restoreIP(OldIP);
2122     return VectorValue;
2123   }
2124 
2125   // If this scalar is unknown, assume that it is a constant or that it is
2126   // loop invariant. Broadcast V and save the value for future uses.
2127   Value *B = getBroadcastInstrs(V);
2128   VectorLoopValueMap.setVectorValue(V, Part, B);
2129   return B;
2130 }
2131 
2132 Value *
2133 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2134                                             const VPIteration &Instance) {
2135   // If the value is not an instruction contained in the loop, it should
2136   // already be scalar.
2137   if (OrigLoop->isLoopInvariant(V))
2138     return V;
2139 
2140   assert(Instance.Lane > 0
2141              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2142              : true && "Uniform values only have lane zero");
2143 
2144   // If the value from the original loop has not been vectorized, it is
2145   // represented by UF x VF scalar values in the new loop. Return the requested
2146   // scalar value.
2147   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2148     return VectorLoopValueMap.getScalarValue(V, Instance);
2149 
2150   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2151   // for the given unroll part. If this entry is not a vector type (i.e., the
2152   // vectorization factor is one), there is no need to generate an
2153   // extractelement instruction.
2154   auto *U = getOrCreateVectorValue(V, Instance.Part);
2155   if (!U->getType()->isVectorTy()) {
2156     assert(VF == 1 && "Value not scalarized has non-vector type");
2157     return U;
2158   }
2159 
2160   // Otherwise, the value from the original loop has been vectorized and is
2161   // represented by UF vector values. Extract and return the requested scalar
2162   // value from the appropriate vector lane.
2163   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2164 }
2165 
2166 void InnerLoopVectorizer::packScalarIntoVectorValue(
2167     Value *V, const VPIteration &Instance) {
2168   assert(V != Induction && "The new induction variable should not be used.");
2169   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2170   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2171 
2172   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2173   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2174   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2175                                             Builder.getInt32(Instance.Lane));
2176   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2177 }
2178 
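// For example, with VF = 4 this turns <a, b, c, d> into <d, c, b, a> using the
// shuffle mask <3, 2, 1, 0>.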
2179 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2180   assert(Vec->getType()->isVectorTy() && "Invalid type");
2181   SmallVector<int, 8> ShuffleMask;
2182   for (unsigned i = 0; i < VF; ++i)
2183     ShuffleMask.push_back(VF - i - 1);
2184 
2185   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2186                                      ShuffleMask, "reverse");
2187 }
2188 
2189 // Return whether we allow using masked interleave-groups (for dealing with
2190 // strided loads/stores that reside in predicated blocks, or for dealing
2191 // with gaps).
2192 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2193   // If an override option has been passed in for interleaved accesses, use it.
2194   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2195     return EnableMaskedInterleavedMemAccesses;
2196 
2197   return TTI.enableMaskedInterleavedAccessVectorization();
2198 }
2199 
2200 // Try to vectorize the interleave group that \p Instr belongs to.
2201 //
2202 // E.g. Translate following interleaved load group (factor = 3):
2203 //   for (i = 0; i < N; i+=3) {
2204 //     R = Pic[i];             // Member of index 0
2205 //     G = Pic[i+1];           // Member of index 1
2206 //     B = Pic[i+2];           // Member of index 2
2207 //     ... // do something to R, G, B
2208 //   }
2209 // To:
2210 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2211 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2212 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2213 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2214 //
2215 // Or translate following interleaved store group (factor = 3):
2216 //   for (i = 0; i < N; i+=3) {
2217 //     ... do something to R, G, B
2218 //     Pic[i]   = R;           // Member of index 0
2219 //     Pic[i+1] = G;           // Member of index 1
2220 //     Pic[i+2] = B;           // Member of index 2
2221 //   }
2222 // To:
2223 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2224 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2225 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2226 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2227 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2228 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2229     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2230     VPValue *Addr, VPValue *BlockInMask) {
2231   Instruction *Instr = Group->getInsertPos();
2232   const DataLayout &DL = Instr->getModule()->getDataLayout();
2233 
2234   // Prepare for the vector type of the interleaved load/store.
2235   Type *ScalarTy = getMemInstValueType(Instr);
2236   unsigned InterleaveFactor = Group->getFactor();
2237   auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
2238 
2239   // Prepare for the new pointers.
2240   SmallVector<Value *, 2> AddrParts;
2241   unsigned Index = Group->getIndex(Instr);
2242 
2243   // TODO: extend the masked interleaved-group support to reversed access.
2244   assert((!BlockInMask || !Group->isReverse()) &&
2245          "Reversed masked interleave-group not supported.");
2246 
2247   // If the group is reverse, adjust the index to refer to the last vector lane
2248   // instead of the first. We adjust the index from the first vector lane,
2249   // rather than directly getting the pointer for lane VF - 1, because the
2250   // pointer operand of the interleaved access is supposed to be uniform. For
2251   // uniform instructions, we're only required to generate a value for the
2252   // first vector lane in each unroll iteration.
2253   if (Group->isReverse())
2254     Index += (VF - 1) * Group->getFactor();
2255 
2256   for (unsigned Part = 0; Part < UF; Part++) {
2257     Value *AddrPart = State.get(Addr, {Part, 0});
2258     setDebugLocFromInst(Builder, AddrPart);
2259 
    // Note that the current instruction could be a member of any index; we
    // need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2271 
2272     bool InBounds = false;
2273     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2274       InBounds = gep->isInBounds();
2275     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2276     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2277 
2278     // Cast to the vector pointer type.
2279     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2280     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2281     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2282   }
2283 
2284   setDebugLocFromInst(Builder, Instr);
2285   Value *UndefVec = UndefValue::get(VecTy);
2286 
2287   Value *MaskForGaps = nullptr;
2288   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2289     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2290     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2291   }
2292 
2293   // Vectorize the interleaved load group.
2294   if (isa<LoadInst>(Instr)) {
2295     // For each unroll part, create a wide load for the group.
2296     SmallVector<Value *, 2> NewLoads;
2297     for (unsigned Part = 0; Part < UF; Part++) {
2298       Instruction *NewLoad;
2299       if (BlockInMask || MaskForGaps) {
2300         assert(useMaskedInterleavedAccesses(*TTI) &&
2301                "masked interleaved groups are not allowed.");
2302         Value *GroupMask = MaskForGaps;
2303         if (BlockInMask) {
2304           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2305           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2306           Value *ShuffledMask = Builder.CreateShuffleVector(
2307               BlockInMaskPart, Undefs,
2308               createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
2309           GroupMask = MaskForGaps
2310                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2311                                                 MaskForGaps)
2312                           : ShuffledMask;
2313         }
2314         NewLoad =
2315             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2316                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2319         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2320                                             Group->getAlign(), "wide.vec");
2321       Group->addMetadata(NewLoad);
2322       NewLoads.push_back(NewLoad);
2323     }
2324 
2325     // For each member in the group, shuffle out the appropriate data from the
2326     // wide loads.
2327     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2328       Instruction *Member = Group->getMember(I);
2329 
2330       // Skip the gaps in the group.
2331       if (!Member)
2332         continue;
2333 
2334       auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
2335       for (unsigned Part = 0; Part < UF; Part++) {
2336         Value *StridedVec = Builder.CreateShuffleVector(
2337             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2338 
        // If this member has a different type, cast the result type.
2340         if (Member->getType() != ScalarTy) {
2341           VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
2342           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2343         }
2344 
2345         if (Group->isReverse())
2346           StridedVec = reverseVector(StridedVec);
2347 
2348         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2349       }
2350     }
2351     return;
2352   }
2353 
  // The sub vector type for the current instruction.
2355   auto *SubVT = FixedVectorType::get(ScalarTy, VF);
2356 
2357   // Vectorize the interleaved store group.
2358   for (unsigned Part = 0; Part < UF; Part++) {
2359     // Collect the stored vector from each member.
2360     SmallVector<Value *, 4> StoredVecs;
2361     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2363       Instruction *Member = Group->getMember(i);
2364       assert(Member && "Fail to get a member from an interleaved store group");
2365 
2366       Value *StoredVec = getOrCreateVectorValue(
2367           cast<StoreInst>(Member)->getValueOperand(), Part);
2368       if (Group->isReverse())
2369         StoredVec = reverseVector(StoredVec);
2370 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2374         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2375 
2376       StoredVecs.push_back(StoredVec);
2377     }
2378 
2379     // Concatenate all vectors into a wide vector.
2380     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2381 
2382     // Interleave the elements in the wide vector.
2383     Value *IVec = Builder.CreateShuffleVector(
2384         WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
2385         "interleaved.vec");
2386 
2387     Instruction *NewStoreInstr;
2388     if (BlockInMask) {
2389       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2390       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2391       Value *ShuffledMask = Builder.CreateShuffleVector(
2392           BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
2393           "interleaved.mask");
2394       NewStoreInstr = Builder.CreateMaskedStore(
2395           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2398       NewStoreInstr =
2399           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2400 
2401     Group->addMetadata(NewStoreInstr);
2402   }
2403 }
2404 
2405 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2406                                                      VPTransformState &State,
2407                                                      VPValue *Addr,
2408                                                      VPValue *StoredValue,
2409                                                      VPValue *BlockInMask) {
2410   // Attempt to issue a wide load.
2411   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2412   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2413 
2414   assert((LI || SI) && "Invalid Load/Store instruction");
2415   assert((!SI || StoredValue) && "No stored value provided for widened store");
2416   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2417 
2418   LoopVectorizationCostModel::InstWidening Decision =
2419       Cost->getWideningDecision(Instr, VF);
2420   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2421           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2422           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2423          "CM decision is not to widen the memory instruction");
2424 
2425   Type *ScalarDataTy = getMemInstValueType(Instr);
2426   auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
2427   const Align Alignment = getLoadStoreAlignment(Instr);
2428 
2429   // Determine if the pointer operand of the access is either consecutive or
2430   // reverse consecutive.
2431   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2432   bool ConsecutiveStride =
2433       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2434   bool CreateGatherScatter =
2435       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2436 
2437   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2438   // gather/scatter. Otherwise Decision should have been to Scalarize.
2439   assert((ConsecutiveStride || CreateGatherScatter) &&
2440          "The instruction should be scalarized");
2441   (void)ConsecutiveStride;
2442 
2443   VectorParts BlockInMaskParts(UF);
2444   bool isMaskRequired = BlockInMask;
2445   if (isMaskRequired)
2446     for (unsigned Part = 0; Part < UF; ++Part)
2447       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2448 
2449   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2450     // Calculate the pointer for the specific unroll-part.
2451     GetElementPtrInst *PartPtr = nullptr;
2452 
2453     bool InBounds = false;
2454     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2455       InBounds = gep->isInBounds();
2456 
2457     if (Reverse) {
      // If the address is consecutive but reversed, then the wide load/store
      // needs to start at the last vector element.
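      // In total the pointer is advanced by -Part * VF - (VF - 1) elements;
      // e.g., with VF = 4, part 0 covers elements [-3, 0] and part 1 covers
      // elements [-7, -4] relative to Ptr.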
2460       PartPtr = cast<GetElementPtrInst>(
2461           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2462       PartPtr->setIsInBounds(InBounds);
2463       PartPtr = cast<GetElementPtrInst>(
2464           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2465       PartPtr->setIsInBounds(InBounds);
2466       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2467         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2468     } else {
2469       PartPtr = cast<GetElementPtrInst>(
2470           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2471       PartPtr->setIsInBounds(InBounds);
2472     }
2473 
2474     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2475     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2476   };
2477 
2478   // Handle Stores:
2479   if (SI) {
2480     setDebugLocFromInst(Builder, SI);
2481 
2482     for (unsigned Part = 0; Part < UF; ++Part) {
2483       Instruction *NewSI = nullptr;
2484       Value *StoredVal = State.get(StoredValue, Part);
2485       if (CreateGatherScatter) {
2486         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2487         Value *VectorGep = State.get(Addr, Part);
2488         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2489                                             MaskPart);
2490       } else {
2491         if (Reverse) {
2492           // If we store to reverse consecutive memory locations, then we need
2493           // to reverse the order of elements in the stored value.
2494           StoredVal = reverseVector(StoredVal);
2495           // We don't want to update the value in the map as it might be used in
2496           // another expression. So don't call resetVectorValue(StoredVal).
2497         }
2498         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2499         if (isMaskRequired)
2500           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2501                                             BlockInMaskParts[Part]);
2502         else
2503           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2504       }
2505       addMetadata(NewSI, SI);
2506     }
2507     return;
2508   }
2509 
2510   // Handle loads.
2511   assert(LI && "Must have a load instruction");
2512   setDebugLocFromInst(Builder, LI);
2513   for (unsigned Part = 0; Part < UF; ++Part) {
2514     Value *NewLI;
2515     if (CreateGatherScatter) {
2516       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2517       Value *VectorGep = State.get(Addr, Part);
2518       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2519                                          nullptr, "wide.masked.gather");
2520       addMetadata(NewLI, LI);
2521     } else {
2522       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2523       if (isMaskRequired)
2524         NewLI = Builder.CreateMaskedLoad(
2525             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2526             "wide.masked.load");
2527       else
2528         NewLI =
2529             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2530 
2531       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2532       addMetadata(NewLI, LI);
2533       if (Reverse)
2534         NewLI = reverseVector(NewLI);
2535     }
2536     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2537   }
2538 }
2539 
2540 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2541                                                const VPIteration &Instance,
2542                                                bool IfPredicateInstr,
2543                                                VPTransformState &State) {
2544   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2545 
2546   setDebugLocFromInst(Builder, Instr);
2547 
  // Does this instruction return a value?
2549   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2550 
2551   Instruction *Cloned = Instr->clone();
2552   if (!IsVoidRetTy)
2553     Cloned->setName(Instr->getName() + ".cloned");
2554 
2555   // Replace the operands of the cloned instructions with their scalar
2556   // equivalents in the new loop.
2557   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2558     auto *NewOp = State.get(User.getOperand(op), Instance);
2559     Cloned->setOperand(op, NewOp);
2560   }
2561   addNewMetadata(Cloned, Instr);
2562 
2563   // Place the cloned scalar in the new loop.
2564   Builder.Insert(Cloned);
2565 
2566   // Add the cloned scalar to the scalar map entry.
2567   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2568 
  // If we just cloned a new assumption, add it to the assumption cache.
2570   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2571     if (II->getIntrinsicID() == Intrinsic::assume)
2572       AC->registerAssumption(II);
2573 
2574   // End if-block.
2575   if (IfPredicateInstr)
2576     PredicatedInstructions.push_back(Cloned);
2577 }
2578 
2579 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2580                                                       Value *End, Value *Step,
2581                                                       Instruction *DL) {
2582   BasicBlock *Header = L->getHeader();
2583   BasicBlock *Latch = L->getLoopLatch();
2584   // As we're just creating this loop, it's possible no latch exists
2585   // yet. If so, use the header as this will be a single block loop.
2586   if (!Latch)
2587     Latch = Header;
2588 
2589   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2590   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2591   setDebugLocFromInst(Builder, OldInst);
2592   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2593 
2594   Builder.SetInsertPoint(Latch->getTerminator());
2595   setDebugLocFromInst(Builder, OldInst);
2596 
2597   // Create i+1 and fill the PHINode.
2598   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2599   Induction->addIncoming(Start, L->getLoopPreheader());
2600   Induction->addIncoming(Next, Latch);
2601   // Create the compare.
2602   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2603   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2604 
2605   // Now we have two terminators. Remove the old one from the block.
2606   Latch->getTerminator()->eraseFromParent();
2607 
2608   return Induction;
2609 }
2610 
2611 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2612   if (TripCount)
2613     return TripCount;
2614 
2615   assert(L && "Create Trip Count for null loop.");
2616   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2617   // Find the loop boundaries.
2618   ScalarEvolution *SE = PSE.getSE();
2619   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2620   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2621          "Invalid loop count");
2622 
2623   Type *IdxTy = Legal->getWidestInductionType();
2624   assert(IdxTy && "No type for induction");
2625 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we get a backedge-taken count in that case is if
  // the induction variable was signed and as such will not overflow. In such
  // a case the truncation is legal.
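  // E.g., a loop 'for (i32 i = 0; (i64)i < n; ++i)' with an i64 bound has an
  // i64 backedge-taken count but only an i32 induction; the count is safely
  // truncated to i32 below.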
2631   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2632       IdxTy->getPrimitiveSizeInBits())
2633     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2634   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2635 
2636   // Get the total trip count from the count by adding 1.
2637   const SCEV *ExitCount = SE->getAddExpr(
2638       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2639 
2640   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2641 
2642   // Expand the trip count and place the new instructions in the preheader.
2643   // Notice that the pre-header does not change, only the loop body.
2644   SCEVExpander Exp(*SE, DL, "induction");
2645 
2646   // Count holds the overall loop count (N).
2647   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2648                                 L->getLoopPreheader()->getTerminator());
2649 
2650   if (TripCount->getType()->isPointerTy())
2651     TripCount =
2652         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2653                                     L->getLoopPreheader()->getTerminator());
2654 
2655   return TripCount;
2656 }
2657 
2658 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2659   if (VectorTripCount)
2660     return VectorTripCount;
2661 
2662   Value *TC = getOrCreateTripCount(L);
2663   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2664 
2665   Type *Ty = TC->getType();
2666   Constant *Step = ConstantInt::get(Ty, VF * UF);
2667 
2668   // If the tail is to be folded by masking, round the number of iterations N
2669   // up to a multiple of Step instead of rounding down. This is done by first
2670   // adding Step-1 and then rounding down. Note that it's ok if this addition
2671   // overflows: the vector induction variable will eventually wrap to zero given
2672   // that it starts at zero and its Step is a power of two; the loop will then
2673   // exit, with the last early-exit vector comparison also producing all-true.
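  // For example, with N = 10 and Step = VF * UF = 8, the trip count is first
  // bumped to 17 here and then rounded down to 16 below, so the masked vector
  // loop runs two iterations.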
2674   if (Cost->foldTailByMasking()) {
2675     assert(isPowerOf2_32(VF * UF) &&
2676            "VF*UF must be a power of 2 when folding tail by masking");
2677     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2678   }
2679 
2680   // Now we need to generate the expression for the part of the loop that the
2681   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2682   // iterations are not required for correctness, or N - Step, otherwise. Step
2683   // is equal to the vectorization factor (number of SIMD elements) times the
2684   // unroll factor (number of SIMD instructions).
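  // For example, with N = 21, VF = 4 and UF = 2 (Step = 8) and no tail
  // folding, the vector loop covers 16 iterations and the scalar remainder
  // loop handles the other 5.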
2685   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2686 
2687   // If there is a non-reversed interleaved group that may speculatively access
2688   // memory out-of-bounds, we need to ensure that there will be at least one
2689   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2690   // the trip count, we set the remainder to be equal to the step. If the step
2691   // does not evenly divide the trip count, no adjustment is necessary since
2692   // there will already be scalar iterations. Note that the minimum iterations
2693   // check ensures that N >= Step.
2694   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2695     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2696     R = Builder.CreateSelect(IsZero, Step, R);
2697   }
2698 
2699   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2700 
2701   return VectorTripCount;
2702 }
2703 
2704 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2705                                                    const DataLayout &DL) {
2706   // Verify that V is a vector type with same number of elements as DstVTy.
2707   unsigned VF = DstVTy->getNumElements();
2708   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2710   Type *SrcElemTy = SrcVecTy->getElementType();
2711   Type *DstElemTy = DstVTy->getElementType();
2712   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2713          "Vector elements must have same size");
2714 
2715   // Do a direct cast if element types are castable.
2716   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2717     return Builder.CreateBitOrPointerCast(V, DstVTy);
2718   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
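  // For example, on a target with 64-bit pointers, casting <4 x double> to a
  // vector of pointers goes through <4 x i64>.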
2723   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2724          "Only one type should be a pointer type");
2725   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2726          "Only one type should be a floating point type");
2727   Type *IntTy =
2728       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2729   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2730   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2731   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2732 }
2733 
2734 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2735                                                          BasicBlock *Bypass) {
2736   Value *Count = getOrCreateTripCount(L);
2737   // Reuse existing vector loop preheader for TC checks.
2738   // Note that new preheader block is generated for vector loop.
2739   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2740   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2741 
2742   // Generate code to check if the loop's trip count is less than VF * UF, or
2743   // equal to it in case a scalar epilogue is required; this implies that the
2744   // vector trip count is zero. This check also covers the case where adding one
2745   // to the backedge-taken count overflowed leading to an incorrect trip count
2746   // of zero. In this case we will also jump to the scalar loop.
2747   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2748                                           : ICmpInst::ICMP_ULT;
2749 
2750   // If tail is to be folded, vector loop takes care of all iterations.
2751   Value *CheckMinIters = Builder.getFalse();
2752   if (!Cost->foldTailByMasking())
2753     CheckMinIters = Builder.CreateICmp(
2754         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2755         "min.iters.check");
2756 
2757   // Create new preheader for vector loop.
2758   LoopVectorPreHeader =
2759       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2760                  "vector.ph");
2761 
2762   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2763                                DT->getNode(Bypass)->getIDom()) &&
2764          "TC check is expected to dominate Bypass");
2765 
2766   // Update dominator for Bypass & LoopExit.
2767   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2768   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2769 
2770   ReplaceInstWithInst(
2771       TCCheckBlock->getTerminator(),
2772       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2773   LoopBypassBlocks.push_back(TCCheckBlock);
2774 }
2775 
2776 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2777   // Reuse existing vector loop preheader for SCEV checks.
2778   // Note that new preheader block is generated for vector loop.
2779   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2780 
  // Generate the code to check the SCEV assumptions that we made.
2782   // We want the new basic block to start at the first instruction in a
2783   // sequence of instructions that form a check.
2784   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2785                    "scev.check");
2786   Value *SCEVCheck = Exp.expandCodeForPredicate(
2787       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2788 
2789   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2790     if (C->isZero())
2791       return;
2792 
2793   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2794            OptForSizeBasedOnProfile) &&
2795          "Cannot SCEV check stride or overflow when optimizing for size");
2796 
2797   SCEVCheckBlock->setName("vector.scevcheck");
2798   // Create new preheader for vector loop.
2799   LoopVectorPreHeader =
2800       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2801                  nullptr, "vector.ph");
2802 
2803   // Update dominator only if this is first RT check.
2804   if (LoopBypassBlocks.empty()) {
2805     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2806     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2807   }
2808 
2809   ReplaceInstWithInst(
2810       SCEVCheckBlock->getTerminator(),
2811       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2812   LoopBypassBlocks.push_back(SCEVCheckBlock);
2813   AddedSafetyChecks = true;
2814 }
2815 
2816 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2817   // VPlan-native path does not do any analysis for runtime checks currently.
2818   if (EnableVPlanNativePath)
2819     return;
2820 
2821   // Reuse existing vector loop preheader for runtime memory checks.
2822   // Note that new preheader block is generated for vector loop.
2823   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2824 
2825   // Generate the code that checks in runtime if arrays overlap. We put the
2826   // checks into a separate block to make the more common case of few elements
2827   // faster.
2828   auto *LAI = Legal->getLAI();
2829   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2830   if (!RtPtrChecking.Need)
2831     return;
2832   Instruction *FirstCheckInst;
2833   Instruction *MemRuntimeCheck;
2834   std::tie(FirstCheckInst, MemRuntimeCheck) =
2835       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2836                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2837   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2838                             "claimed checks are required");
2839 
2840   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2841     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2842            "Cannot emit memory checks when optimizing for size, unless forced "
2843            "to vectorize.");
2844     ORE->emit([&]() {
2845       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2846                                         L->getStartLoc(), L->getHeader())
2847              << "Code-size may be reduced by not forcing "
2848                 "vectorization, or by source-code modifications "
2849                 "eliminating the need for runtime checks "
2850                 "(e.g., adding 'restrict').";
2851     });
2852   }
2853 
2854   MemCheckBlock->setName("vector.memcheck");
2855   // Create new preheader for vector loop.
2856   LoopVectorPreHeader =
2857       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2858                  "vector.ph");
2859 
2860   // Update dominator only if this is first RT check.
2861   if (LoopBypassBlocks.empty()) {
2862     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2863     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2864   }
2865 
2866   ReplaceInstWithInst(
2867       MemCheckBlock->getTerminator(),
2868       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2869   LoopBypassBlocks.push_back(MemCheckBlock);
2870   AddedSafetyChecks = true;
2871 
2872   // We currently don't use LoopVersioning for the actual loop cloning but we
2873   // still use it to add the noalias metadata.
2874   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2875                                           PSE.getSE());
2876   LVer->prepareNoAliasMetadata();
2877 }
2878 
2879 Value *InnerLoopVectorizer::emitTransformedIndex(
2880     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2881     const InductionDescriptor &ID) const {
2882 
2883   SCEVExpander Exp(*SE, DL, "induction");
2884   auto Step = ID.getStep();
2885   auto StartValue = ID.getStartValue();
2886   assert(Index->getType() == Step->getType() &&
2887          "Index type does not match StepValue type");
2888 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
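  // For integer and pointer inductions the transformed index is essentially
  // StartValue + Index * Step (emitted as a GEP for pointers); the lambdas
  // below only fold away the trivial multiply-by-one and add-zero cases.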
2895   auto CreateAdd = [&B](Value *X, Value *Y) {
2896     assert(X->getType() == Y->getType() && "Types don't match!");
2897     if (auto *CX = dyn_cast<ConstantInt>(X))
2898       if (CX->isZero())
2899         return Y;
2900     if (auto *CY = dyn_cast<ConstantInt>(Y))
2901       if (CY->isZero())
2902         return X;
2903     return B.CreateAdd(X, Y);
2904   };
2905 
2906   auto CreateMul = [&B](Value *X, Value *Y) {
2907     assert(X->getType() == Y->getType() && "Types don't match!");
2908     if (auto *CX = dyn_cast<ConstantInt>(X))
2909       if (CX->isOne())
2910         return Y;
2911     if (auto *CY = dyn_cast<ConstantInt>(Y))
2912       if (CY->isOne())
2913         return X;
2914     return B.CreateMul(X, Y);
2915   };
2916 
2917   // Get a suitable insert point for SCEV expansion. For blocks in the vector
2918   // loop, choose the end of the vector loop header (=LoopVectorBody), because
2919   // the DomTree is not kept up-to-date for additional blocks generated in the
2920   // vector loop. By using the header as insertion point, we guarantee that the
2921   // expanded instructions dominate all their uses.
2922   auto GetInsertPoint = [this, &B]() {
2923     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
2924     if (InsertBB != LoopVectorBody &&
2925         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
2926       return LoopVectorBody->getTerminator();
2927     return &*B.GetInsertPoint();
2928   };
2929   switch (ID.getKind()) {
2930   case InductionDescriptor::IK_IntInduction: {
2931     assert(Index->getType() == StartValue->getType() &&
2932            "Index type does not match StartValue type");
2933     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2934       return B.CreateSub(StartValue, Index);
2935     auto *Offset = CreateMul(
2936         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
2937     return CreateAdd(StartValue, Offset);
2938   }
2939   case InductionDescriptor::IK_PtrInduction: {
2940     assert(isa<SCEVConstant>(Step) &&
2941            "Expected constant step for pointer induction");
2942     return B.CreateGEP(
2943         StartValue->getType()->getPointerElementType(), StartValue,
2944         CreateMul(Index,
2945                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
2946   }
2947   case InductionDescriptor::IK_FpInduction: {
2948     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2949     auto InductionBinOp = ID.getInductionBinOp();
2950     assert(InductionBinOp &&
2951            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2952             InductionBinOp->getOpcode() == Instruction::FSub) &&
2953            "Original bin op should be defined for FP induction");
2954 
2955     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2956 
2957     // Floating point operations had to be 'fast' to enable the induction.
2958     FastMathFlags Flags;
2959     Flags.setFast();
2960 
2961     Value *MulExp = B.CreateFMul(StepValue, Index);
2962     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2964       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2965 
2966     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2967                                "induction");
2968     if (isa<Instruction>(BOp))
2969       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2970 
2971     return BOp;
2972   }
2973   case InductionDescriptor::IK_NoInduction:
2974     return nullptr;
2975   }
2976   llvm_unreachable("invalid enum");
2977 }
2978 
2979 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2980   LoopScalarBody = OrigLoop->getHeader();
2981   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2982   LoopExitBlock = OrigLoop->getExitBlock();
2983   assert(LoopExitBlock && "Must have an exit block");
2984   assert(LoopVectorPreHeader && "Invalid loop structure");
2985 
2986   LoopMiddleBlock =
2987       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2988                  LI, nullptr, Twine(Prefix) + "middle.block");
2989   LoopScalarPreHeader =
2990       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2991                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
2995   LoopVectorBody =
2996       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2997                  nullptr, nullptr, Twine(Prefix) + "vector.body");
2998 
2999   // Update dominator for loop exit.
3000   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3001 
3002   // Create and register the new vector loop.
3003   Loop *Lp = LI->AllocateLoop();
3004   Loop *ParentLoop = OrigLoop->getParentLoop();
3005 
3006   // Insert the new loop into the loop nest and register the new basic blocks
3007   // before calling any utilities such as SCEV that require valid LoopInfo.
3008   if (ParentLoop) {
3009     ParentLoop->addChildLoop(Lp);
3010   } else {
3011     LI->addTopLevelLoop(Lp);
3012   }
3013   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3014   return Lp;
3015 }
3016 
3017 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3018                                                       Value *VectorTripCount) {
3019   assert(VectorTripCount && L && "Expected valid arguments");
3020   // We are going to resume the execution of the scalar loop.
3021   // Go over all of the induction variables that we found and fix the
3022   // PHIs that are left in the scalar version of the loop.
3023   // The starting values of PHI nodes depend on the counter of the last
3024   // iteration in the vectorized loop.
3025   // If we come from a bypass edge then we need to start from the original
3026   // start value.
3027   for (auto &InductionEntry : Legal->getInductionVars()) {
3028     PHINode *OrigPhi = InductionEntry.first;
3029     InductionDescriptor II = InductionEntry.second;
3030 
    // Create phi nodes to merge from the backedge-taken check block.
3032     PHINode *BCResumeVal =
3033         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3034                         LoopScalarPreHeader->getTerminator());
3035     // Copy original phi DL over to the new one.
3036     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3037     Value *&EndValue = IVEndValues[OrigPhi];
3038     if (OrigPhi == OldInduction) {
3039       // We know what the end value is.
3040       EndValue = VectorTripCount;
3041     } else {
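      // For a non-primary induction the end value is computed by transforming
      // the vector trip count through the induction descriptor, i.e.
      // StartValue + VectorTripCount * Step for an integer induction.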
3042       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3043       Type *StepType = II.getStep()->getType();
3044       Instruction::CastOps CastOp =
3045           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3046       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3047       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3048       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3049       EndValue->setName("ind.end");
3050     }
3051 
3052     // The new PHI merges the original incoming value, in case of a bypass,
3053     // or the value at the end of the vectorized loop.
3054     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3055 
3056     // Fix the scalar body counter (PHI node).
3057     // The old induction's phi node in the scalar body needs the truncated
3058     // value.
3059     for (BasicBlock *BB : LoopBypassBlocks)
3060       BCResumeVal->addIncoming(II.getStartValue(), BB);
3061     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3062   }
3063 }
3064 
3065 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3066                                                       MDNode *OrigLoopID) {
3067   assert(L && "Expected valid loop.");
3068 
3069   // The trip counts should be cached by now.
3070   Value *Count = getOrCreateTripCount(L);
3071   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3072 
3073   // We need the OrigLoop (scalar loop part) latch terminator to help
3074   // produce correct debug info for the middle block BB instructions.
3075   // The legality check stage guarantees that the loop will have a single
3076   // latch.
3077   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3078          "Scalar loop latch terminator isn't a branch");
3079   BranchInst *ScalarLatchBr =
3080       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3081 
3082   // Add a check in the middle block to see if we have completed
3083   // all of the iterations in the first vector loop.
3084   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3085   // If tail is to be folded, we know we don't need to run the remainder.
3086   Value *CmpN = Builder.getTrue();
3087   if (!Cost->foldTailByMasking()) {
3088     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3089                            VectorTripCount, "cmp.n",
3090                            LoopMiddleBlock->getTerminator());
3091 
    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3096     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3097   }
3098 
3099   BranchInst *BrInst =
3100       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3101   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3102   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3103 
3104   // Get ready to start creating new instructions into the vectorized body.
3105   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3106          "Inconsistent vector loop preheader");
3107   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3108 
3109   Optional<MDNode *> VectorizedLoopID =
3110       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3111                                       LLVMLoopVectorizeFollowupVectorized});
3112   if (VectorizedLoopID.hasValue()) {
3113     L->setLoopID(VectorizedLoopID.getValue());
3114 
3115     // Do not setAlreadyVectorized if loop attributes have been defined
3116     // explicitly.
3117     return LoopVectorPreHeader;
3118   }
3119 
3120   // Keep all loop hints from the original loop on the vector loop (we'll
3121   // replace the vectorizer-specific hints below).
3122   if (MDNode *LID = OrigLoop->getLoopID())
3123     L->setLoopID(LID);
3124 
3125   LoopVectorizeHints Hints(L, true, *ORE);
3126   Hints.setAlreadyVectorized();
3127 
3128 #ifdef EXPENSIVE_CHECKS
3129   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3130   LI->verify(*DT);
3131 #endif
3132 
3133   return LoopVectorPreHeader;
3134 }
3135 
3136 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3137   /*
3138    In this function we generate a new loop. The new loop will contain
3139    the vectorized instructions while the old loop will continue to run the
3140    scalar remainder.
3141 
3142        [ ] <-- loop iteration number check.
3143     /   |
3144    /    v
3145   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3146   |  /  |
3147   | /   v
3148   ||   [ ]     <-- vector pre header.
3149   |/    |
3150   |     v
3151   |    [  ] \
3152   |    [  ]_|   <-- vector loop.
3153   |     |
3154   |     v
3155   |   -[ ]   <--- middle-block.
3156   |  /  |
3157   | /   v
3158   -|- >[ ]     <--- new preheader.
3159    |    |
3160    |    v
3161    |   [ ] \
3162    |   [ ]_|   <-- old scalar loop to handle remainder.
3163     \   |
3164      \  v
3165       >[ ]     <-- exit block.
3166    ...
3167    */
3168 
3169   // Get the metadata of the original loop before it gets modified.
3170   MDNode *OrigLoopID = OrigLoop->getLoopID();
3171 
3172   // Create an empty vector loop, and prepare basic blocks for the runtime
3173   // checks.
3174   Loop *Lp = createVectorLoopSkeleton("");
3175 
3176   // Now, compare the new count to zero. If it is zero skip the vector loop and
3177   // jump to the scalar loop. This check also covers the case where the
3178   // backedge-taken count is uint##_max: adding one to it will overflow leading
3179   // to an incorrect trip count of zero. In this (rare) case we will also jump
3180   // to the scalar loop.
3181   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3182 
3183   // Generate the code to check any assumptions that we've made for SCEV
3184   // expressions.
3185   emitSCEVChecks(Lp, LoopScalarPreHeader);
3186 
3187   // Generate the code that checks in runtime if arrays overlap. We put the
3188   // checks into a separate block to make the more common case of few elements
3189   // faster.
3190   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3191 
3192   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3194   // induction variables. In the code below we also support a case where we
3195   // don't have a single induction variable.
3196   //
3197   // We try to obtain an induction variable from the original loop as hard
3198   // as possible. However if we don't find one that:
3199   //   - is an integer
3200   //   - counts from zero, stepping by one
3201   //   - is the size of the widest induction variable type
3202   // then we create a new one.
3203   OldInduction = Legal->getPrimaryInduction();
3204   Type *IdxTy = Legal->getWidestInductionType();
3205   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3206   // The loop step is equal to the vectorization factor (num of SIMD elements)
3207   // times the unroll factor (num of SIMD instructions).
3208   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3209   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3210   Induction =
3211       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3212                               getDebugLocFromInstOrOperands(OldInduction));
3213 
3214   // Emit phis for the new starting index of the scalar loop.
3215   createInductionResumeValues(Lp, CountRoundDown);
3216 
3217   return completeLoopSkeleton(Lp, OrigLoopID);
3218 }
3219 
3220 // Fix up external users of the induction variable. At this point, we are
3221 // in LCSSA form, with all external PHIs that use the IV having one input value,
3222 // coming from the remainder loop. We need those PHIs to also have a correct
3223 // value for the IV when arriving directly from the middle block.
3224 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3225                                        const InductionDescriptor &II,
3226                                        Value *CountRoundDown, Value *EndValue,
3227                                        BasicBlock *MiddleBlock) {
3228   // There are two kinds of external IV usages - those that use the value
3229   // computed in the last iteration (the PHI) and those that use the penultimate
3230   // value (the value that feeds into the phi from the loop latch).
3231   // We allow both, but they, obviously, have different values.
3232 
3233   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3234 
3235   DenseMap<Value *, Value *> MissingVals;
3236 
3237   // An external user of the last iteration's value should see the value that
3238   // the remainder loop uses to initialize its own IV.
3239   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3240   for (User *U : PostInc->users()) {
3241     Instruction *UI = cast<Instruction>(U);
3242     if (!OrigLoop->contains(UI)) {
3243       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3244       MissingVals[UI] = EndValue;
3245     }
3246   }
3247 
  // An external user of the penultimate value needs to see EndValue - Step.
3249   // The simplest way to get this is to recompute it from the constituent SCEVs,
3250   // that is Start + (Step * (CRD - 1)).
3251   for (User *U : OrigPhi->users()) {
3252     auto *UI = cast<Instruction>(U);
3253     if (!OrigLoop->contains(UI)) {
3254       const DataLayout &DL =
3255           OrigLoop->getHeader()->getModule()->getDataLayout();
3256       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3257 
3258       IRBuilder<> B(MiddleBlock->getTerminator());
3259       Value *CountMinusOne = B.CreateSub(
3260           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3261       Value *CMO =
3262           !II.getStep()->getType()->isIntegerTy()
3263               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3264                              II.getStep()->getType())
3265               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3266       CMO->setName("cast.cmo");
3267       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3268       Escape->setName("ind.escape");
3269       MissingVals[UI] = Escape;
3270     }
3271   }
3272 
3273   for (auto &I : MissingVals) {
3274     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3276     // that is %IV2 = phi [...], [ %IV1, %latch ]
3277     // In this case, if IV1 has an external use, we need to avoid adding both
3278     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3279     // don't already have an incoming value for the middle block.
3280     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3281       PHI->addIncoming(I.second, MiddleBlock);
3282   }
3283 }
3284 
3285 namespace {
3286 
3287 struct CSEDenseMapInfo {
3288   static bool canHandle(const Instruction *I) {
3289     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3290            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3291   }
3292 
3293   static inline Instruction *getEmptyKey() {
3294     return DenseMapInfo<Instruction *>::getEmptyKey();
3295   }
3296 
3297   static inline Instruction *getTombstoneKey() {
3298     return DenseMapInfo<Instruction *>::getTombstoneKey();
3299   }
3300 
3301   static unsigned getHashValue(const Instruction *I) {
3302     assert(canHandle(I) && "Unknown instruction!");
3303     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3304                                                            I->value_op_end()));
3305   }
3306 
3307   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3308     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3309         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3310       return LHS == RHS;
3311     return LHS->isIdenticalTo(RHS);
3312   }
3313 };
3314 
3315 } // end anonymous namespace
3316 
/// Perform CSE of induction variable instructions.
3318 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
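  // E.g., identical 'extractelement' or GEP instructions created while
  // widening induction variables collapse to a single instruction here.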
3320   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3321   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3322     Instruction *In = &*I++;
3323 
3324     if (!CSEDenseMapInfo::canHandle(In))
3325       continue;
3326 
3327     // Check if we can replace this instruction with any of the
3328     // visited instructions.
3329     if (Instruction *V = CSEMap.lookup(In)) {
3330       In->replaceAllUsesWith(V);
3331       In->eraseFromParent();
3332       continue;
3333     }
3334 
3335     CSEMap[In] = In;
3336   }
3337 }
3338 
3339 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3340                                                        unsigned VF,
3341                                                        bool &NeedToScalarize) {
3342   Function *F = CI->getCalledFunction();
3343   Type *ScalarRetTy = CI->getType();
3344   SmallVector<Type *, 4> Tys, ScalarTys;
3345   for (auto &ArgOp : CI->arg_operands())
3346     ScalarTys.push_back(ArgOp->getType());
3347 
3348   // Estimate cost of scalarized vector call. The source operands are assumed
3349   // to be vectors, so we need to extract individual elements from there,
3350   // execute VF scalar calls, and then gather the result into the vector return
3351   // value.
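  // I.e., the scalarized cost is VF * (cost of one scalar call) plus the
  // extract/insert overhead computed below.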
3352   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3353                                                  TTI::TCK_RecipThroughput);
3354   if (VF == 1)
3355     return ScalarCallCost;
3356 
3357   // Compute corresponding vector type for return value and arguments.
3358   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3359   for (Type *ScalarTy : ScalarTys)
3360     Tys.push_back(ToVectorTy(ScalarTy, VF));
3361 
3362   // Compute costs of unpacking argument values for the scalar calls and
3363   // packing the return values to a vector.
3364   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3365 
3366   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3367 
3368   // If we can't emit a vector call for this function, then the currently found
3369   // cost is the cost we need to return.
3370   NeedToScalarize = true;
3371   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3372   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3373 
3374   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3375     return Cost;
3376 
3377   // If the corresponding vector cost is cheaper, return its cost.
3378   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3379                                                  TTI::TCK_RecipThroughput);
3380   if (VectorCallCost < Cost) {
3381     NeedToScalarize = false;
3382     return VectorCallCost;
3383   }
3384   return Cost;
3385 }
3386 
3387 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3388                                                             unsigned VF) {
3389   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3390   assert(ID && "Expected intrinsic call!");
3391 
3392   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3393   return TTI.getIntrinsicInstrCost(CostAttrs,
3394                                    TargetTransformInfo::TCK_RecipThroughput);
3395 }
3396 
3397 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3398   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3399   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3400   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3401 }
3402 
3403 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3404   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3405   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3406   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3407 }
3408 
3409 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3410   // For every instruction `I` in MinBWs, truncate the operands, create a
3411   // truncated version of `I` and reextend its result. InstCombine runs
3412   // later and will remove any ext/trunc pairs.
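  // For example, if an i32 add only needs 8 bits, its operands are truncated
  // to <VF x i8>, the add is recreated on the narrow type, and the result is
  // zero-extended back to <VF x i32>.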
3413   SmallPtrSet<Value *, 4> Erased;
3414   for (const auto &KV : Cost->getMinimalBitwidths()) {
3415     // If the value wasn't vectorized, we must maintain the original scalar
3416     // type. The absence of the value from VectorLoopValueMap indicates that it
3417     // wasn't vectorized.
3418     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3419       continue;
3420     for (unsigned Part = 0; Part < UF; ++Part) {
3421       Value *I = getOrCreateVectorValue(KV.first, Part);
3422       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3423         continue;
3424       Type *OriginalTy = I->getType();
3425       Type *ScalarTruncatedTy =
3426           IntegerType::get(OriginalTy->getContext(), KV.second);
3427       auto *TruncatedTy = FixedVectorType::get(
3428           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3429       if (TruncatedTy == OriginalTy)
3430         continue;
3431 
3432       IRBuilder<> B(cast<Instruction>(I));
3433       auto ShrinkOperand = [&](Value *V) -> Value * {
3434         if (auto *ZI = dyn_cast<ZExtInst>(V))
3435           if (ZI->getSrcTy() == TruncatedTy)
3436             return ZI->getOperand(0);
3437         return B.CreateZExtOrTrunc(V, TruncatedTy);
3438       };
3439 
3440       // The actual instruction modification depends on the instruction type,
3441       // unfortunately.
3442       Value *NewI = nullptr;
3443       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3444         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3445                              ShrinkOperand(BO->getOperand(1)));
3446 
3447         // Any wrapping introduced by shrinking this operation shouldn't be
3448         // considered undefined behavior. So, we can't unconditionally copy
3449         // arithmetic wrapping flags to NewI.
3450         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3451       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3452         NewI =
3453             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3454                          ShrinkOperand(CI->getOperand(1)));
3455       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3456         NewI = B.CreateSelect(SI->getCondition(),
3457                               ShrinkOperand(SI->getTrueValue()),
3458                               ShrinkOperand(SI->getFalseValue()));
3459       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3460         switch (CI->getOpcode()) {
3461         default:
3462           llvm_unreachable("Unhandled cast!");
3463         case Instruction::Trunc:
3464           NewI = ShrinkOperand(CI->getOperand(0));
3465           break;
3466         case Instruction::SExt:
3467           NewI = B.CreateSExtOrTrunc(
3468               CI->getOperand(0),
3469               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3470           break;
3471         case Instruction::ZExt:
3472           NewI = B.CreateZExtOrTrunc(
3473               CI->getOperand(0),
3474               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3475           break;
3476         }
3477       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3478         auto Elements0 =
3479             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3480         auto *O0 = B.CreateZExtOrTrunc(
3481             SI->getOperand(0),
3482             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3483         auto Elements1 =
3484             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3485         auto *O1 = B.CreateZExtOrTrunc(
3486             SI->getOperand(1),
3487             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3488 
3489         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3490       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3491         // Don't do anything with the operands, just extend the result.
3492         continue;
3493       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3494         auto Elements =
3495             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3496         auto *O0 = B.CreateZExtOrTrunc(
3497             IE->getOperand(0),
3498             FixedVectorType::get(ScalarTruncatedTy, Elements));
3499         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3500         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3501       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3502         auto Elements =
3503             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3504         auto *O0 = B.CreateZExtOrTrunc(
3505             EE->getOperand(0),
3506             FixedVectorType::get(ScalarTruncatedTy, Elements));
3507         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3508       } else {
3509         // If we don't know what to do, be conservative and don't do anything.
3510         continue;
3511       }
3512 
3513       // Lastly, extend the result.
3514       NewI->takeName(cast<Instruction>(I));
3515       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3516       I->replaceAllUsesWith(Res);
3517       cast<Instruction>(I)->eraseFromParent();
3518       Erased.insert(I);
3519       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3520     }
3521   }
3522 
3523   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3524   for (const auto &KV : Cost->getMinimalBitwidths()) {
3525     // If the value wasn't vectorized, we must maintain the original scalar
3526     // type. The absence of the value from VectorLoopValueMap indicates that it
3527     // wasn't vectorized.
3528     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3529       continue;
3530     for (unsigned Part = 0; Part < UF; ++Part) {
3531       Value *I = getOrCreateVectorValue(KV.first, Part);
3532       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3533       if (Inst && Inst->use_empty()) {
3534         Value *NewI = Inst->getOperand(0);
3535         Inst->eraseFromParent();
3536         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3537       }
3538     }
3539   }
3540 }
3541 
3542 void InnerLoopVectorizer::fixVectorizedLoop() {
3543   // Insert truncates and extends for any truncated instructions as hints to
3544   // InstCombine.
3545   if (VF > 1)
3546     truncateToMinimalBitwidths();
3547 
3548   // Fix widened non-induction PHIs by setting up the PHI operands.
3549   if (OrigPHIsToFix.size()) {
3550     assert(EnableVPlanNativePath &&
3551            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3552     fixNonInductionPHIs();
3553   }
3554 
3555   // At this point every instruction in the original loop is widened to a
3556   // vector form. Now we need to fix the recurrences in the loop. These PHI
3557   // nodes are currently empty because we did not want to introduce cycles.
3558   // This is the second stage of vectorizing recurrences.
3559   fixCrossIterationPHIs();
3560 
3561   // Forget the original basic block.
3562   PSE.getSE()->forgetLoop(OrigLoop);
3563 
3564   // Fix-up external users of the induction variables.
3565   for (auto &Entry : Legal->getInductionVars())
3566     fixupIVUsers(Entry.first, Entry.second,
3567                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3568                  IVEndValues[Entry.first], LoopMiddleBlock);
3569 
3570   fixLCSSAPHIs();
3571   for (Instruction *PI : PredicatedInstructions)
3572     sinkScalarOperands(&*PI);
3573 
3574   // Remove redundant induction instructions.
3575   cse(LoopVectorBody);
3576 
3577   // Set/update profile weights for the vector and remainder loops as original
3578   // loop iterations are now distributed among them. Note that original loop
3579   // represented by LoopScalarBody becomes remainder loop after vectorization.
3580   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3586   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3587                                LI->getLoopFor(LoopVectorBody),
3588                                LI->getLoopFor(LoopScalarBody), VF * UF);
3589 }
3590 
3591 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3592   // In order to support recurrences we need to be able to vectorize Phi nodes.
3593   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3594   // stage #2: We now need to fix the recurrences by adding incoming edges to
3595   // the currently empty PHI nodes. At this point every instruction in the
3596   // original loop is widened to a vector form so we can use them to construct
3597   // the incoming edges.
3598   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3599     // Handle first-order recurrences and reductions that need to be fixed.
3600     if (Legal->isFirstOrderRecurrence(&Phi))
3601       fixFirstOrderRecurrence(&Phi);
3602     else if (Legal->isReductionVariable(&Phi))
3603       fixReduction(&Phi);
3604   }
3605 }
3606 
3607 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3608   // This is the second phase of vectorizing first-order recurrences. An
3609   // overview of the transformation is described below. Suppose we have the
3610   // following loop.
3611   //
3612   //   for (int i = 0; i < n; ++i)
3613   //     b[i] = a[i] - a[i - 1];
3614   //
3615   // There is a first-order recurrence on "a". For this loop, the shorthand
3616   // scalar IR looks like:
3617   //
3618   //   scalar.ph:
3619   //     s_init = a[-1]
3620   //     br scalar.body
3621   //
3622   //   scalar.body:
3623   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3624   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3625   //     s2 = a[i]
3626   //     b[i] = s2 - s1
3627   //     br cond, scalar.body, ...
3628   //
  // In this example, s1 is a recurrence because its value depends on the
3630   // previous iteration. In the first phase of vectorization, we created a
3631   // temporary value for s1. We now complete the vectorization and produce the
3632   // shorthand vector IR shown below (for VF = 4, UF = 1).
3633   //
3634   //   vector.ph:
3635   //     v_init = vector(..., ..., ..., a[-1])
3636   //     br vector.body
3637   //
3638   //   vector.body
3639   //     i = phi [0, vector.ph], [i+4, vector.body]
3640   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3641   //     v2 = a[i, i+1, i+2, i+3];
3642   //     v3 = vector(v1(3), v2(0, 1, 2))
3643   //     b[i, i+1, i+2, i+3] = v2 - v3
3644   //     br cond, vector.body, middle.block
3645   //
3646   //   middle.block:
3647   //     x = v2(3)
3648   //     br scalar.ph
3649   //
3650   //   scalar.ph:
3651   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3652   //     br scalar.body
3653   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3656 
3657   // Get the original loop preheader and single loop latch.
3658   auto *Preheader = OrigLoop->getLoopPreheader();
3659   auto *Latch = OrigLoop->getLoopLatch();
3660 
3661   // Get the initial and previous values of the scalar recurrence.
3662   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3663   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3664 
3665   // Create a vector from the initial value.
3666   auto *VectorInit = ScalarInit;
3667   if (VF > 1) {
3668     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3669     VectorInit = Builder.CreateInsertElement(
3670         UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
3671         VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
3672   }
3673 
3674   // We constructed a temporary phi node in the first phase of vectorization.
3675   // This phi node will eventually be deleted.
3676   Builder.SetInsertPoint(
3677       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3678 
3679   // Create a phi node for the new recurrence. The current value will either be
3680   // the initial value inserted into a vector or loop-varying vector value.
3681   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3682   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3683 
3684   // Get the vectorized previous value of the last part UF - 1. It appears last
3685   // among all unrolled iterations, due to the order of their construction.
3686   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3687 
3688   // Find and set the insertion point after the previous value if it is an
3689   // instruction.
3690   BasicBlock::iterator InsertPt;
3691   // Note that the previous value may have been constant-folded so it is not
3692   // guaranteed to be an instruction in the vector loop.
3693   // FIXME: Loop invariant values do not form recurrences. We should deal with
3694   //        them earlier.
3695   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3696     InsertPt = LoopVectorBody->getFirstInsertionPt();
3697   else {
3698     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3699     if (isa<PHINode>(PreviousLastPart))
3700       // If the previous value is a phi node, we should insert after all the phi
3701       // nodes in the block containing the PHI to avoid breaking basic block
3702       // verification. Note that the basic block may be different to
3703       // LoopVectorBody, in case we predicate the loop.
3704       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3705     else
3706       InsertPt = ++PreviousInst->getIterator();
3707   }
3708   Builder.SetInsertPoint(&*InsertPt);
3709 
3710   // We will construct a vector for the recurrence by combining the values for
3711   // the current and previous iterations. This is the required shuffle mask.
3712   SmallVector<int, 8> ShuffleMask(VF);
3713   ShuffleMask[0] = VF - 1;
3714   for (unsigned I = 1; I < VF; ++I)
3715     ShuffleMask[I] = I + VF - 1;
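  // For example, with VF = 4 the mask is <3, 4, 5, 6>: index 3 selects the
  // last element of the first shuffle operand (the incoming/previous vector),
  // and indices 4-6 select the first three elements of the second operand
  // (the current vector), matching v3 = vector(v1(3), v2(0, 1, 2)) above.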
3716 
3717   // The vector from which to take the initial value for the current iteration
3718   // (actual or unrolled). Initially, this is the vector phi node.
3719   Value *Incoming = VecPhi;
3720 
3721   // Shuffle the current and previous vector and update the vector parts.
3722   for (unsigned Part = 0; Part < UF; ++Part) {
3723     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3724     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3725     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3726                                                          ShuffleMask)
3727                            : Incoming;
3728     PhiPart->replaceAllUsesWith(Shuffle);
3729     cast<Instruction>(PhiPart)->eraseFromParent();
3730     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3731     Incoming = PreviousPart;
3732   }
3733 
3734   // Fix the latch value of the new recurrence in the vector loop.
3735   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3736 
3737   // Extract the last vector element in the middle block. This will be the
3738   // initial value for the recurrence when jumping to the scalar loop.
3739   auto *ExtractForScalar = Incoming;
3740   if (VF > 1) {
3741     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3742     ExtractForScalar = Builder.CreateExtractElement(
3743         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3744   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
3750   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3751   if (VF > 1)
3752     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3753         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is only unrolled and not vectorized, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
3758   else if (UF > 1)
3759     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3760 
3761   // Fix the initial value of the original recurrence in the scalar loop.
3762   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3763   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3764   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3765     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3766     Start->addIncoming(Incoming, BB);
3767   }
3768 
3769   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3770   Phi->setName("scalar.recur");
3771 
3772   // Finally, fix users of the recurrence outside the loop. The users will need
3773   // either the last value of the scalar recurrence or the last value of the
3774   // vector recurrence we extracted in the middle block. Since the loop is in
3775   // LCSSA form, we just need to find all the phi nodes for the original scalar
3776   // recurrence in the exit block, and then add an edge for the middle block.
3777   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3778     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3779       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3780     }
3781   }
3782 }
3783 
3784 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3785   Constant *Zero = Builder.getInt32(0);
3786 
  // Get its reduction variable descriptor.
3788   assert(Legal->isReductionVariable(Phi) &&
3789          "Unable to find the reduction variable");
3790   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3791 
3792   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3793   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3794   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3795   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3796     RdxDesc.getMinMaxRecurrenceKind();
3797   setDebugLocFromInst(Builder, ReductionStartValue);
3798 
3799   // We need to generate a reduction vector from the incoming scalar.
3800   // To do so, we need to generate the 'identity' vector and override
3801   // one of the elements with the incoming scalar reduction. We need
3802   // to do it in the vector-loop preheader.
3803   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3804 
3805   // This is the vector-clone of the value that leaves the loop.
3806   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3807 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 (all ones) for and.
3810   Value *Identity;
3811   Value *VectorStart;
3812   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3813       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3815     if (VF == 1) {
3816       VectorStart = Identity = ReductionStartValue;
3817     } else {
3818       VectorStart = Identity =
3819         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3820     }
3821   } else {
3822     // Handle other reduction kinds:
3823     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3824         RK, VecTy->getScalarType());
3825     if (VF == 1) {
3826       Identity = Iden;
3827       // This vector is the Identity vector where the first element is the
3828       // incoming scalar reduction.
3829       VectorStart = ReductionStartValue;
3830     } else {
3831       Identity = ConstantVector::getSplat({VF, false}, Iden);
3832 
3833       // This vector is the Identity vector where the first element is the
3834       // incoming scalar reduction.
3835       VectorStart =
3836         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3837     }
3838   }
3839 
  // Wrap flags are in general invalid after vectorization: the reduction is
  // reassociated into per-part/per-lane partial results that may wrap even
  // when the original scalar reduction does not, so clear them.
3841   clearReductionWrapFlags(RdxDesc);
3842 
3843   // Fix the vector-loop phi.
3844 
3845   // Reductions do not have to start at zero. They can start with
3846   // any loop invariant values.
3847   BasicBlock *Latch = OrigLoop->getLoopLatch();
3848   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3849 
3850   for (unsigned Part = 0; Part < UF; ++Part) {
3851     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3852     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3853     // Make sure to add the reduction start value only to the
3854     // first unroll part.
3855     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3856     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3857     cast<PHINode>(VecRdxPhi)
3858       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3859   }
3860 
3861   // Before each round, move the insertion point right between
3862   // the PHIs and the values we are going to write.
3863   // This allows us to write both PHINodes and the extractelement
3864   // instructions.
3865   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3866 
3867   setDebugLocFromInst(Builder, LoopExitInst);
3868 
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be a Select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the LoopExitInst itself.
3872   if (Cost->foldTailByMasking()) {
3873     for (unsigned Part = 0; Part < UF; ++Part) {
3874       Value *VecLoopExitInst =
3875           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3876       Value *Sel = nullptr;
3877       for (User *U : VecLoopExitInst->users()) {
3878         if (isa<SelectInst>(U)) {
3879           assert(!Sel && "Reduction exit feeding two selects");
3880           Sel = U;
3881         } else
          assert(isa<PHINode>(U) && "Reduction exit must feed PHIs or a select");
3883       }
3884       assert(Sel && "Reduction exit feeds no select");
3885       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3886     }
3887   }
3888 
3889   // If the vector reduction can be performed in a smaller type, we truncate
3890   // then extend the loop exit value to enable InstCombine to evaluate the
3891   // entire expression in the smaller type.
3892   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3893     Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
3894     Builder.SetInsertPoint(
3895         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3896     VectorParts RdxParts(UF);
3897     for (unsigned Part = 0; Part < UF; ++Part) {
3898       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3899       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3900       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3901                                         : Builder.CreateZExt(Trunc, VecTy);
3902       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3903            UI != RdxParts[Part]->user_end();)
3904         if (*UI != Trunc) {
3905           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3906           RdxParts[Part] = Extnd;
3907         } else {
3908           ++UI;
3909         }
3910     }
3911     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3912     for (unsigned Part = 0; Part < UF; ++Part) {
3913       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3914       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3915     }
3916   }
3917 
3918   // Reduce all of the unrolled parts into a single vector.
3919   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3920   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3921 
3922   // The middle block terminator has already been assigned a DebugLoc here (the
3923   // OrigLoop's single latch terminator). We want the whole middle block to
3924   // appear to execute on this line because: (a) it is all compiler generated,
3925   // (b) these instructions are always executed after evaluating the latch
3926   // conditional branch, and (c) other passes may add new predecessors which
3927   // terminate on this line. This is the easiest way to ensure we don't
3928   // accidentally cause an extra step back into the loop while debugging.
3929   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3930   for (unsigned Part = 1; Part < UF; ++Part) {
3931     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3932     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3933       // Floating point operations had to be 'fast' to enable the reduction.
3934       ReducedPartRdx = addFastMathFlag(
3935           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3936                               ReducedPartRdx, "bin.rdx"),
3937           RdxDesc.getFastMathFlags());
3938     else
3939       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3940                                       RdxPart);
3941   }
3942 
3943   if (VF > 1) {
3944     bool NoNaN = Legal->hasFunNoNaNAttr();
3945     ReducedPartRdx =
3946         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3947     // If the reduction can be performed in a smaller type, we need to extend
3948     // the reduction to the wider type before we branch to the original loop.
3949     if (Phi->getType() != RdxDesc.getRecurrenceType())
3950       ReducedPartRdx =
3951         RdxDesc.isSigned()
3952         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3953         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3954   }
3955 
3956   // Create a phi node that merges control-flow from the backedge-taken check
3957   // block and the middle block.
3958   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3959                                         LoopScalarPreHeader->getTerminator());
3960   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3961     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3962   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3963 
3964   // Now, we need to fix the users of the reduction variable
3965   // inside and outside of the scalar remainder loop.
3966   // We know that the loop is in LCSSA form. We need to update the
3967   // PHI nodes in the exit blocks.
3968   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3969     // All PHINodes need to have a single entry edge, or two if
3970     // we already fixed them.
3971     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3972 
3973     // We found a reduction value exit-PHI. Update it with the
3974     // incoming bypass edge.
3975     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3976       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3977   } // end of the LCSSA phi scan.
3978 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3981   int IncomingEdgeBlockIdx =
3982     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3983   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3984   // Pick the other block.
3985   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3986   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3987   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3988 }
3989 
3990 void InnerLoopVectorizer::clearReductionWrapFlags(
3991     RecurrenceDescriptor &RdxDesc) {
3992   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3993   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3994       RK != RecurrenceDescriptor::RK_IntegerMult)
3995     return;
3996 
3997   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3998   assert(LoopExitInstr && "null loop exit instruction");
3999   SmallVector<Instruction *, 8> Worklist;
4000   SmallPtrSet<Instruction *, 8> Visited;
4001   Worklist.push_back(LoopExitInstr);
4002   Visited.insert(LoopExitInstr);
4003 
4004   while (!Worklist.empty()) {
4005     Instruction *Cur = Worklist.pop_back_val();
4006     if (isa<OverflowingBinaryOperator>(Cur))
4007       for (unsigned Part = 0; Part < UF; ++Part) {
4008         Value *V = getOrCreateVectorValue(Cur, Part);
4009         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4010       }
4011 
4012     for (User *U : Cur->users()) {
4013       Instruction *UI = cast<Instruction>(U);
4014       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4015           Visited.insert(UI).second)
4016         Worklist.push_back(UI);
4017     }
4018   }
4019 }
4020 
4021 void InnerLoopVectorizer::fixLCSSAPHIs() {
4022   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4023     if (LCSSAPhi.getNumIncomingValues() == 1) {
4024       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // A non-instruction incoming value has only a single copy, so lane
      // zero is used.
4026       unsigned LastLane = 0;
4027       if (isa<Instruction>(IncomingValue))
4028           LastLane = Cost->isUniformAfterVectorization(
4029                          cast<Instruction>(IncomingValue), VF)
4030                          ? 0
4031                          : VF - 1;
4032       // Can be a loop invariant incoming value or the last scalar value to be
4033       // extracted from the vectorized loop.
4034       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4035       Value *lastIncomingValue =
4036           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4037       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4038     }
4039   }
4040 }
4041 
4042 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4043   // The basic block and loop containing the predicated instruction.
4044   auto *PredBB = PredInst->getParent();
4045   auto *VectorLoop = LI->getLoopFor(PredBB);
4046 
4047   // Initialize a worklist with the operands of the predicated instruction.
4048   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4049 
4050   // Holds instructions that we need to analyze again. An instruction may be
4051   // reanalyzed if we don't yet know if we can sink it or not.
4052   SmallVector<Instruction *, 8> InstsToReanalyze;
4053 
4054   // Returns true if a given use occurs in the predicated block. Phi nodes use
4055   // their operands in their corresponding predecessor blocks.
4056   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4057     auto *I = cast<Instruction>(U.getUser());
4058     BasicBlock *BB = I->getParent();
4059     if (auto *Phi = dyn_cast<PHINode>(I))
4060       BB = Phi->getIncomingBlock(
4061           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4062     return BB == PredBB;
4063   };
4064 
4065   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4067   // operands are then added to the worklist. The algorithm ends after one pass
4068   // through the worklist doesn't sink a single instruction.
4069   bool Changed;
4070   do {
4071     // Add the instructions that need to be reanalyzed to the worklist, and
4072     // reset the changed indicator.
4073     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4074     InstsToReanalyze.clear();
4075     Changed = false;
4076 
4077     while (!Worklist.empty()) {
4078       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4079 
4080       // We can't sink an instruction if it is a phi node, is already in the
4081       // predicated block, is not in the loop, or may have side effects.
4082       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4083           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4084         continue;
4085 
4086       // It's legal to sink the instruction if all its uses occur in the
4087       // predicated block. Otherwise, there's nothing to do yet, and we may
4088       // need to reanalyze the instruction.
4089       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4090         InstsToReanalyze.push_back(I);
4091         continue;
4092       }
4093 
4094       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4096       I->moveBefore(&*PredBB->getFirstInsertionPt());
4097       Worklist.insert(I->op_begin(), I->op_end());
4098 
4099       // The sinking may have enabled other instructions to be sunk, so we will
4100       // need to iterate.
4101       Changed = true;
4102     }
4103   } while (Changed);
4104 }
4105 
4106 void InnerLoopVectorizer::fixNonInductionPHIs() {
4107   for (PHINode *OrigPhi : OrigPHIsToFix) {
4108     PHINode *NewPhi =
4109         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4110     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4111 
4112     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4113         predecessors(OrigPhi->getParent()));
4114     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4115         predecessors(NewPhi->getParent()));
4116     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4117            "Scalar and Vector BB should have the same number of predecessors");
4118 
4119     // The insertion point in Builder may be invalidated by the time we get
4120     // here. Force the Builder insertion point to something valid so that we do
4121     // not run into issues during insertion point restore in
4122     // getOrCreateVectorValue calls below.
4123     Builder.SetInsertPoint(NewPhi);
4124 
4125     // The predecessor order is preserved and we can rely on mapping between
4126     // scalar and vector block predecessors.
4127     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4128       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4129 
4130       // When looking up the new scalar/vector values to fix up, use incoming
4131       // values from original phi.
4132       Value *ScIncV =
4133           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4134 
4135       // Scalar incoming value may need a broadcast
4136       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4137       NewPhi->addIncoming(NewIncV, NewPredBB);
4138     }
4139   }
4140 }
4141 
4142 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4143                                    unsigned UF, unsigned VF,
4144                                    bool IsPtrLoopInvariant,
4145                                    SmallBitVector &IsIndexLoopInvariant,
4146                                    VPTransformState &State) {
4147   // Construct a vector GEP by widening the operands of the scalar GEP as
4148   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4149   // results in a vector of pointers when at least one operand of the GEP
4150   // is vector-typed. Thus, to keep the representation compact, we only use
4151   // vector-typed operands for loop-varying values.
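  // For example, a scalar GEP "getelementptr i32, i32* %base, i64 %i" with a
  // loop-invariant base and a loop-varying index %i is widened (for VF = 4)
  // by keeping %base scalar and using the vector of indices <%i0, ..., %i3>,
  // yielding a single GEP that produces a <4 x i32*> vector of pointers.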
4152 
4153   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4154     // If we are vectorizing, but the GEP has only loop-invariant operands,
4155     // the GEP we build (by only using vector-typed operands for
4156     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4157     // produce a vector of pointers, we need to either arbitrarily pick an
4158     // operand to broadcast, or broadcast a clone of the original GEP.
4159     // Here, we broadcast a clone of the original.
4160     //
4161     // TODO: If at some point we decide to scalarize instructions having
4162     //       loop-invariant operands, this special case will no longer be
4163     //       required. We would add the scalarization decision to
4164     //       collectLoopScalars() and teach getVectorValue() to broadcast
4165     //       the lane-zero scalar value.
4166     auto *Clone = Builder.Insert(GEP->clone());
4167     for (unsigned Part = 0; Part < UF; ++Part) {
4168       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4169       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4170       addMetadata(EntryPart, GEP);
4171     }
4172   } else {
4173     // If the GEP has at least one loop-varying operand, we are sure to
4174     // produce a vector of pointers. But if we are only unrolling, we want
4175     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4176     // produce with the code below will be scalar (if VF == 1) or vector
4177     // (otherwise). Note that for the unroll-only case, we still maintain
4178     // values in the vector mapping with initVector, as we do for other
4179     // instructions.
4180     for (unsigned Part = 0; Part < UF; ++Part) {
4181       // The pointer operand of the new GEP. If it's loop-invariant, we
4182       // won't broadcast it.
4183       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4184                                      : State.get(Operands.getOperand(0), Part);
4185 
4186       // Collect all the indices for the new GEP. If any index is
4187       // loop-invariant, we won't broadcast it.
4188       SmallVector<Value *, 4> Indices;
4189       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4190         VPValue *Operand = Operands.getOperand(I);
4191         if (IsIndexLoopInvariant[I - 1])
4192           Indices.push_back(State.get(Operand, {0, 0}));
4193         else
4194           Indices.push_back(State.get(Operand, Part));
4195       }
4196 
4197       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4198       // but it should be a vector, otherwise.
4199       auto *NewGEP =
4200           GEP->isInBounds()
4201               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4202                                           Indices)
4203               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4204       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4205              "NewGEP is not a pointer vector");
4206       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4207       addMetadata(NewGEP, GEP);
4208     }
4209   }
4210 }
4211 
4212 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4213                                               unsigned VF) {
4214   PHINode *P = cast<PHINode>(PN);
4215   if (EnableVPlanNativePath) {
4216     // Currently we enter here in the VPlan-native path for non-induction
4217     // PHIs where all control flow is uniform. We simply widen these PHIs.
4218     // Create a vector phi with no operands - the vector phi operands will be
4219     // set at the end of vector code generation.
4220     Type *VecTy =
4221         (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4222     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4223     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4224     OrigPHIsToFix.push_back(P);
4225 
4226     return;
4227   }
4228 
4229   assert(PN->getParent() == OrigLoop->getHeader() &&
4230          "Non-header phis should have been handled elsewhere");
4231 
4232   // In order to support recurrences we need to be able to vectorize Phi nodes.
4233   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4234   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4235   // this value when we vectorize all of the instructions that use the PHI.
4236   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4237     for (unsigned Part = 0; Part < UF; ++Part) {
4238       // This is phase one of vectorizing PHIs.
4239       Type *VecTy =
4240           (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4241       Value *EntryPart = PHINode::Create(
4242           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4243       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4244     }
4245     return;
4246   }
4247 
4248   setDebugLocFromInst(Builder, P);
4249 
4250   // This PHINode must be an induction variable.
4251   // Make sure that we know about it.
4252   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4253 
4254   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4255   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4256 
4257   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4258   // which can be found from the original scalar operations.
4259   switch (II.getKind()) {
4260   case InductionDescriptor::IK_NoInduction:
4261     llvm_unreachable("Unknown induction");
4262   case InductionDescriptor::IK_IntInduction:
4263   case InductionDescriptor::IK_FpInduction:
4264     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4265   case InductionDescriptor::IK_PtrInduction: {
4266     // Handle the pointer induction variable case.
4267     assert(P->getType()->isPointerTy() && "Unexpected type.");
4268 
4269     if (Cost->isScalarAfterVectorization(P, VF)) {
      // This is the normalized index of the pointer induction, counting
      // from zero.
4271       Value *PtrInd =
4272           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4273       // Determine the number of scalars we need to generate for each unroll
4274       // iteration. If the instruction is uniform, we only need to generate the
4275       // first lane. Otherwise, we generate all VF values.
4276       unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4277       for (unsigned Part = 0; Part < UF; ++Part) {
4278         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4279           Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4280           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4281           Value *SclrGep =
4282               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4283           SclrGep->setName("next.gep");
4284           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4285         }
4286       }
4287       return;
4288     }
4289     assert(isa<SCEVConstant>(II.getStep()) &&
4290            "Induction step not a SCEV constant!");
4291     Type *PhiType = II.getStep()->getType();
4292 
4293     // Build a pointer phi
4294     Value *ScalarStartValue = II.getStartValue();
4295     Type *ScStValueType = ScalarStartValue->getType();
4296     PHINode *NewPointerPhi =
4297         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4298     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4299 
4300     // A pointer induction, performed by using a gep
4301     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4302     Instruction *InductionLoc = LoopLatch->getTerminator();
4303     const SCEV *ScalarStep = II.getStep();
4304     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4305     Value *ScalarStepValue =
4306         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4307     Value *InductionGEP = GetElementPtrInst::Create(
4308         ScStValueType->getPointerElementType(), NewPointerPhi,
4309         Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)),
4310         "ptr.ind", InductionLoc);
4311     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4312 
4313     // Create UF many actual address geps that use the pointer
4314     // phi as base and a vectorized version of the step value
4315     // (<step*0, ..., step*N>) as offset.
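    // For example, with VF = 4 and UF = 2, part 0 uses the offsets
    // <0, 1, 2, 3> and part 1 uses <4, 5, 6, 7>, each multiplied by the
    // splatted scalar step.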
4316     for (unsigned Part = 0; Part < UF; ++Part) {
4317       SmallVector<Constant *, 8> Indices;
4318       // Create a vector of consecutive numbers from zero to VF.
4319       for (unsigned i = 0; i < VF; ++i)
4320         Indices.push_back(ConstantInt::get(PhiType, i + Part * VF));
4321       Constant *StartOffset = ConstantVector::get(Indices);
4322 
4323       Value *GEP = Builder.CreateGEP(
4324           ScStValueType->getPointerElementType(), NewPointerPhi,
4325           Builder.CreateMul(StartOffset,
4326                             Builder.CreateVectorSplat(VF, ScalarStepValue),
4327                             "vector.gep"));
4328       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4329     }
4330   }
4331   }
4332 }
4333 
4334 /// A helper function for checking whether an integer division-related
4335 /// instruction may divide by zero (in which case it must be predicated if
4336 /// executed conditionally in the scalar code).
4337 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4338 /// Non-zero divisors that are non compile-time constants will not be
4339 /// converted into multiplication, so we will still end up scalarizing
4340 /// the division, but can do so w/o predication.
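/// For example, "udiv i32 %x, 7" cannot divide by zero (non-zero constant
/// divisor), whereas "udiv i32 %x, %y" is conservatively assumed to possibly
/// divide by zero.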
4341 static bool mayDivideByZero(Instruction &I) {
4342   assert((I.getOpcode() == Instruction::UDiv ||
4343           I.getOpcode() == Instruction::SDiv ||
4344           I.getOpcode() == Instruction::URem ||
4345           I.getOpcode() == Instruction::SRem) &&
4346          "Unexpected instruction");
4347   Value *Divisor = I.getOperand(1);
4348   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4349   return !CInt || CInt->isZero();
4350 }
4351 
4352 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4353                                            VPTransformState &State) {
4354   switch (I.getOpcode()) {
4355   case Instruction::Call:
4356   case Instruction::Br:
4357   case Instruction::PHI:
4358   case Instruction::GetElementPtr:
4359   case Instruction::Select:
4360     llvm_unreachable("This instruction is handled by a different recipe.");
4361   case Instruction::UDiv:
4362   case Instruction::SDiv:
4363   case Instruction::SRem:
4364   case Instruction::URem:
4365   case Instruction::Add:
4366   case Instruction::FAdd:
4367   case Instruction::Sub:
4368   case Instruction::FSub:
4369   case Instruction::FNeg:
4370   case Instruction::Mul:
4371   case Instruction::FMul:
4372   case Instruction::FDiv:
4373   case Instruction::FRem:
4374   case Instruction::Shl:
4375   case Instruction::LShr:
4376   case Instruction::AShr:
4377   case Instruction::And:
4378   case Instruction::Or:
4379   case Instruction::Xor: {
4380     // Just widen unops and binops.
4381     setDebugLocFromInst(Builder, &I);
4382 
4383     for (unsigned Part = 0; Part < UF; ++Part) {
4384       SmallVector<Value *, 2> Ops;
4385       for (VPValue *VPOp : User.operands())
4386         Ops.push_back(State.get(VPOp, Part));
4387 
4388       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4389 
4390       if (auto *VecOp = dyn_cast<Instruction>(V))
4391         VecOp->copyIRFlags(&I);
4392 
4393       // Use this vector value for all users of the original instruction.
4394       VectorLoopValueMap.setVectorValue(&I, Part, V);
4395       addMetadata(V, &I);
4396     }
4397 
4398     break;
4399   }
4400   case Instruction::ICmp:
4401   case Instruction::FCmp: {
4402     // Widen compares. Generate vector compares.
4403     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4404     auto *Cmp = cast<CmpInst>(&I);
4405     setDebugLocFromInst(Builder, Cmp);
4406     for (unsigned Part = 0; Part < UF; ++Part) {
4407       Value *A = State.get(User.getOperand(0), Part);
4408       Value *B = State.get(User.getOperand(1), Part);
4409       Value *C = nullptr;
4410       if (FCmp) {
4411         // Propagate fast math flags.
4412         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4413         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4414         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4415       } else {
4416         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4417       }
4418       VectorLoopValueMap.setVectorValue(&I, Part, C);
4419       addMetadata(C, &I);
4420     }
4421 
4422     break;
4423   }
4424 
4425   case Instruction::ZExt:
4426   case Instruction::SExt:
4427   case Instruction::FPToUI:
4428   case Instruction::FPToSI:
4429   case Instruction::FPExt:
4430   case Instruction::PtrToInt:
4431   case Instruction::IntToPtr:
4432   case Instruction::SIToFP:
4433   case Instruction::UIToFP:
4434   case Instruction::Trunc:
4435   case Instruction::FPTrunc:
4436   case Instruction::BitCast: {
4437     auto *CI = cast<CastInst>(&I);
4438     setDebugLocFromInst(Builder, CI);
4439 
    // Vectorize casts.
4441     Type *DestTy =
4442         (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
4443 
4444     for (unsigned Part = 0; Part < UF; ++Part) {
4445       Value *A = State.get(User.getOperand(0), Part);
4446       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4447       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4448       addMetadata(Cast, &I);
4449     }
4450     break;
4451   }
4452   default:
4453     // This instruction is not vectorized by simple widening.
4454     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4455     llvm_unreachable("Unhandled instruction!");
4456   } // end of switch.
4457 }
4458 
4459 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4460                                                VPTransformState &State) {
4461   assert(!isa<DbgInfoIntrinsic>(I) &&
4462          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4463   setDebugLocFromInst(Builder, &I);
4464 
4465   Module *M = I.getParent()->getParent()->getParent();
4466   auto *CI = cast<CallInst>(&I);
4467 
4468   SmallVector<Type *, 4> Tys;
4469   for (Value *ArgOperand : CI->arg_operands())
4470     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4471 
4472   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4473 
  // The flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call
  // is more beneficial than the library call.
4477   bool NeedToScalarize = false;
4478   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4479   bool UseVectorIntrinsic =
4480       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4481   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4482          "Instruction should be scalarized elsewhere.");
4483 
4484   for (unsigned Part = 0; Part < UF; ++Part) {
4485     SmallVector<Value *, 4> Args;
4486     for (auto &I : enumerate(ArgOperands.operands())) {
4487       // Some intrinsics have a scalar argument - don't replace it with a
4488       // vector.
4489       Value *Arg;
4490       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4491         Arg = State.get(I.value(), Part);
4492       else
4493         Arg = State.get(I.value(), {0, 0});
4494       Args.push_back(Arg);
4495     }
4496 
4497     Function *VectorF;
4498     if (UseVectorIntrinsic) {
4499       // Use vector version of the intrinsic.
4500       Type *TysForDecl[] = {CI->getType()};
4501       if (VF > 1)
4502         TysForDecl[0] =
4503             FixedVectorType::get(CI->getType()->getScalarType(), VF);
4504       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4505       assert(VectorF && "Can't retrieve vector intrinsic.");
4506     } else {
4507       // Use vector version of the function call.
4508       const VFShape Shape =
4509           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4510 #ifndef NDEBUG
4511       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4512              "Can't create vector function.");
4513 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4525   }
4526 }
4527 
4528 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4529                                                  VPUser &Operands,
4530                                                  bool InvariantCond,
4531                                                  VPTransformState &State) {
4532   setDebugLocFromInst(Builder, &I);
4533 
  // The condition can be loop invariant but still defined inside the
4535   // loop. This means that we can't just use the original 'cond' value.
4536   // We have to take the 'vectorized' value and pick the first lane.
4537   // Instcombine will make this a no-op.
4538   auto *InvarCond =
4539       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4540 
4541   for (unsigned Part = 0; Part < UF; ++Part) {
4542     Value *Cond =
4543         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4544     Value *Op0 = State.get(Operands.getOperand(1), Part);
4545     Value *Op1 = State.get(Operands.getOperand(2), Part);
4546     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4547     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4548     addMetadata(Sel, &I);
4549   }
4550 }
4551 
4552 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4553   // We should not collect Scalars more than once per VF. Right now, this
4554   // function is called from collectUniformsAndScalars(), which already does
4555   // this check. Collecting Scalars for VF=1 does not make any sense.
4556   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4557          "This function should not be visited twice for the same VF");
4558 
4559   SmallSetVector<Instruction *, 8> Worklist;
4560 
4561   // These sets are used to seed the analysis with pointers used by memory
4562   // accesses that will remain scalar.
4563   SmallSetVector<Instruction *, 8> ScalarPtrs;
4564   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4565   auto *Latch = TheLoop->getLoopLatch();
4566 
4567   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4568   // The pointer operands of loads and stores will be scalar as long as the
4569   // memory access is not a gather or scatter operation. The value operand of a
4570   // store will remain scalar if the store is scalarized.
4571   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4572     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4573     assert(WideningDecision != CM_Unknown &&
4574            "Widening decision should be ready at this moment");
4575     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4576       if (Ptr == Store->getValueOperand())
4577         return WideningDecision == CM_Scalarize;
4578     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4579            "Ptr is neither a value or pointer operand");
4580     return WideningDecision != CM_GatherScatter;
4581   };
4582 
  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction that is not loop-invariant.
4585   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4586     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4587             isa<GetElementPtrInst>(V)) &&
4588            !TheLoop->isLoopInvariant(V);
4589   };
4590 
4591   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4592     if (!isa<PHINode>(Ptr) ||
4593         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4594       return false;
4595     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4596     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4597       return false;
4598     return isScalarUse(MemAccess, Ptr);
4599   };
4600 
4601   // A helper that evaluates a memory access's use of a pointer. If the
4602   // pointer is actually the pointer induction of a loop, it is being
4603   // inserted into Worklist. If the use will be a scalar use, and the
4604   // pointer is only used by memory accesses, we place the pointer in
4605   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4606   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4607     if (isScalarPtrInduction(MemAccess, Ptr)) {
4608       Worklist.insert(cast<Instruction>(Ptr));
4609       Instruction *Update = cast<Instruction>(
4610           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4611       Worklist.insert(Update);
4612       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4613                         << "\n");
4614       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4615                         << "\n");
4616       return;
4617     }
4618     // We only care about bitcast and getelementptr instructions contained in
4619     // the loop.
4620     if (!isLoopVaryingBitCastOrGEP(Ptr))
4621       return;
4622 
4623     // If the pointer has already been identified as scalar (e.g., if it was
4624     // also identified as uniform), there's nothing to do.
4625     auto *I = cast<Instruction>(Ptr);
4626     if (Worklist.count(I))
4627       return;
4628 
4629     // If the use of the pointer will be a scalar use, and all users of the
4630     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4631     // place the pointer in PossibleNonScalarPtrs.
4632     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4633           return isa<LoadInst>(U) || isa<StoreInst>(U);
4634         }))
4635       ScalarPtrs.insert(I);
4636     else
4637       PossibleNonScalarPtrs.insert(I);
4638   };
4639 
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar
  // use, and (3) pointer induction phi nodes (and their updates) used by
  // scalar memory accesses.
4644   //
4645   // (1) Add to the worklist all instructions that have been identified as
4646   // uniform-after-vectorization.
4647   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4648 
4649   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4650   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4652   // scatter operation. The value operand of a store will remain scalar if the
4653   // store is scalarized.
4654   for (auto *BB : TheLoop->blocks())
4655     for (auto &I : *BB) {
4656       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4657         evaluatePtrUse(Load, Load->getPointerOperand());
4658       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4659         evaluatePtrUse(Store, Store->getPointerOperand());
4660         evaluatePtrUse(Store, Store->getValueOperand());
4661       }
4662     }
4663   for (auto *I : ScalarPtrs)
4664     if (!PossibleNonScalarPtrs.count(I)) {
4665       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4666       Worklist.insert(I);
4667     }
4668 
4669   // Insert the forced scalars.
4670   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4671   // induction variable when the PHI user is scalarized.
4672   auto ForcedScalar = ForcedScalars.find(VF);
4673   if (ForcedScalar != ForcedScalars.end())
4674     for (auto *I : ForcedScalar->second)
4675       Worklist.insert(I);
4676 
4677   // Expand the worklist by looking through any bitcasts and getelementptr
4678   // instructions we've already identified as scalar. This is similar to the
4679   // expansion step in collectLoopUniforms(); however, here we're only
4680   // expanding to include additional bitcasts and getelementptr instructions.
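  // For example, if the pointer operand of a scalarized load is a bitcast of
  // a getelementptr, the bitcast is seeded above via ScalarPtrs, and this
  // expansion then also marks the getelementptr as scalar, provided all of
  // its users are scalar memory accesses or already in the worklist.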
4681   unsigned Idx = 0;
4682   while (Idx != Worklist.size()) {
4683     Instruction *Dst = Worklist[Idx++];
4684     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4685       continue;
4686     auto *Src = cast<Instruction>(Dst->getOperand(0));
4687     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4688           auto *J = cast<Instruction>(U);
4689           return !TheLoop->contains(J) || Worklist.count(J) ||
4690                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4691                   isScalarUse(J, Src));
4692         })) {
4693       Worklist.insert(Src);
4694       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4695     }
4696   }
4697 
4698   // An induction variable will remain scalar if all users of the induction
4699   // variable and induction variable update remain scalar.
4700   for (auto &Induction : Legal->getInductionVars()) {
4701     auto *Ind = Induction.first;
4702     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4703 
4704     // If tail-folding is applied, the primary induction variable will be used
4705     // to feed a vector compare.
4706     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4707       continue;
4708 
4709     // Determine if all users of the induction variable are scalar after
4710     // vectorization.
4711     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4712       auto *I = cast<Instruction>(U);
4713       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4714     });
4715     if (!ScalarInd)
4716       continue;
4717 
4718     // Determine if all users of the induction variable update instruction are
4719     // scalar after vectorization.
4720     auto ScalarIndUpdate =
4721         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4722           auto *I = cast<Instruction>(U);
4723           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4724         });
4725     if (!ScalarIndUpdate)
4726       continue;
4727 
4728     // The induction variable and its update instruction will remain scalar.
4729     Worklist.insert(Ind);
4730     Worklist.insert(IndUpdate);
4731     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4732     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4733                       << "\n");
4734   }
4735 
4736   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4737 }
4738 
4739 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4740   if (!blockNeedsPredication(I->getParent()))
4741     return false;
4742   switch(I->getOpcode()) {
4743   default:
4744     break;
4745   case Instruction::Load:
4746   case Instruction::Store: {
4747     if (!Legal->isMaskRequired(I))
4748       return false;
4749     auto *Ptr = getLoadStorePointerOperand(I);
4750     auto *Ty = getMemInstValueType(I);
4751     // We have already decided how to vectorize this instruction, get that
4752     // result.
4753     if (VF > 1) {
4754       InstWidening WideningDecision = getWideningDecision(I, VF);
4755       assert(WideningDecision != CM_Unknown &&
4756              "Widening decision should be ready at this moment");
4757       return WideningDecision == CM_Scalarize;
4758     }
4759     const Align Alignment = getLoadStoreAlignment(I);
4760     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4761                                 isLegalMaskedGather(Ty, Alignment))
4762                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4763                                 isLegalMaskedScatter(Ty, Alignment));
4764   }
4765   case Instruction::UDiv:
4766   case Instruction::SDiv:
4767   case Instruction::SRem:
4768   case Instruction::URem:
4769     return mayDivideByZero(*I);
4770   }
4771   return false;
4772 }
4773 
4774 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4775                                                                unsigned VF) {
4776   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4777   assert(getWideningDecision(I, VF) == CM_Unknown &&
4778          "Decision should not be set yet.");
4779   auto *Group = getInterleavedAccessGroup(I);
4780   assert(Group && "Must have a group.");
4781 
  // If the instruction's allocated size doesn't equal its type size, it
4783   // requires padding and will be scalarized.
4784   auto &DL = I->getModule()->getDataLayout();
4785   auto *ScalarTy = getMemInstValueType(I);
4786   if (hasIrregularType(ScalarTy, DL, VF))
4787     return false;
4788 
4789   // Check if masking is required.
4790   // A Group may need masking for one of two reasons: it resides in a block that
4791   // needs predication, or it was decided to use masking to deal with gaps.
4792   bool PredicatedAccessRequiresMasking =
4793       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4794   bool AccessWithGapsRequiresMasking =
4795       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4796   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4797     return true;
4798 
4799   // If masked interleaving is required, we expect that the user/target had
4800   // enabled it, because otherwise it either wouldn't have been created or
4801   // it should have been invalidated by the CostModel.
4802   assert(useMaskedInterleavedAccesses(TTI) &&
4803          "Masked interleave-groups for predicated accesses are not enabled.");
4804 
4805   auto *Ty = getMemInstValueType(I);
4806   const Align Alignment = getLoadStoreAlignment(I);
4807   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4808                           : TTI.isLegalMaskedStore(Ty, Alignment);
4809 }
4810 
4811 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4812                                                                unsigned VF) {
4813   // Get and ensure we have a valid memory instruction.
4814   LoadInst *LI = dyn_cast<LoadInst>(I);
4815   StoreInst *SI = dyn_cast<StoreInst>(I);
4816   assert((LI || SI) && "Invalid memory instruction");
4817 
4818   auto *Ptr = getLoadStorePointerOperand(I);
4819 
4820   // In order to be widened, the pointer should be consecutive, first of all.
4821   if (!Legal->isConsecutivePtr(Ptr))
4822     return false;
4823 
4824   // If the instruction is a store located in a predicated block, it will be
4825   // scalarized.
4826   if (isScalarWithPredication(I))
4827     return false;
4828 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4831   auto &DL = I->getModule()->getDataLayout();
4832   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4833   if (hasIrregularType(ScalarTy, DL, VF))
4834     return false;
4835 
4836   return true;
4837 }
4838 
4839 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4840   // We should not collect Uniforms more than once per VF. Right now,
4841   // this function is called from collectUniformsAndScalars(), which
4842   // already does this check. Collecting Uniforms for VF=1 does not make any
4843   // sense.
4844 
4845   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4846          "This function should not be visited twice for the same VF");
4847 
  // Create an entry for this VF up front. Even if we find no uniform value,
  // we won't analyze this VF again: Uniforms.count(VF) will return 1.
4850   Uniforms[VF].clear();
4851 
4852   // We now know that the loop is vectorizable!
4853   // Collect instructions inside the loop that will remain uniform after
4854   // vectorization.
4855 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4858   auto isOutOfScope = [&](Value *V) -> bool {
4859     Instruction *I = dyn_cast<Instruction>(V);
4860     return (!I || !TheLoop->contains(I));
4861   };
4862 
4863   SetVector<Instruction *> Worklist;
4864   BasicBlock *Latch = TheLoop->getLoopLatch();
4865 
4866   // Instructions that are scalar with predication must not be considered
4867   // uniform after vectorization, because that would create an erroneous
4868   // replicating region where only a single instance out of VF should be formed.
4869   // TODO: optimize such seldom cases if found important, see PR40816.
4870   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4871     if (isScalarWithPredication(I, VF)) {
4872       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4873                         << *I << "\n");
4874       return;
4875     }
4876     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4877     Worklist.insert(I);
4878   };
4879 
4880   // Start with the conditional branch. If the branch condition is an
4881   // instruction contained in the loop that is only used by the branch, it is
4882   // uniform.
4883   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4884   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4885     addToWorklistIfAllowed(Cmp);
4886 
4887   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4888   // are pointers that are treated like consecutive pointers during
4889   // vectorization. The pointer operands of interleaved accesses are an
4890   // example.
4891   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4892 
4893   // Holds pointer operands of instructions that are possibly non-uniform.
4894   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4895 
4896   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4897     InstWidening WideningDecision = getWideningDecision(I, VF);
4898     assert(WideningDecision != CM_Unknown &&
4899            "Widening decision should be ready at this moment");
4900 
4901     return (WideningDecision == CM_Widen ||
4902             WideningDecision == CM_Widen_Reverse ||
4903             WideningDecision == CM_Interleave);
4904   };
4905   // Iterate over the instructions in the loop, and collect all
4906   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4907   // that a consecutive-like pointer operand will be scalarized, we collect it
4908   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4909   // getelementptr instruction can be used by both vectorized and scalarized
4910   // memory instructions. For example, if a loop loads and stores from the same
4911   // location, but the store is conditional, the store will be scalarized, and
4912   // the getelementptr won't remain uniform.
4913   for (auto *BB : TheLoop->blocks())
4914     for (auto &I : *BB) {
4915       // If there's no pointer operand, there's nothing to do.
4916       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4917       if (!Ptr)
4918         continue;
4919 
4920       // True if all users of Ptr are memory accesses that have Ptr as their
4921       // pointer operand.
4922       auto UsersAreMemAccesses =
4923           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4924             return getLoadStorePointerOperand(U) == Ptr;
4925           });
4926 
4927       // Ensure the memory instruction will not be scalarized or used by
4928       // gather/scatter, making its pointer operand non-uniform. If the pointer
4929       // operand is used by any instruction other than a memory access, we
4930       // conservatively assume the pointer operand may be non-uniform.
4931       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4932         PossibleNonUniformPtrs.insert(Ptr);
4933 
4934       // If the memory instruction will be vectorized and its pointer operand
4935       // is consecutive-like, or interleaving - the pointer operand should
4936       // remain uniform.
4937       else
4938         ConsecutiveLikePtrs.insert(Ptr);
4939     }
4940 
4941   // Add to the Worklist all consecutive and consecutive-like pointers that
4942   // aren't also identified as possibly non-uniform.
4943   for (auto *V : ConsecutiveLikePtrs)
4944     if (!PossibleNonUniformPtrs.count(V))
4945       addToWorklistIfAllowed(V);
4946 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
4950   unsigned idx = 0;
4951   while (idx != Worklist.size()) {
4952     Instruction *I = Worklist[idx++];
4953 
4954     for (auto OV : I->operand_values()) {
4955       // isOutOfScope operands cannot be uniform instructions.
4956       if (isOutOfScope(OV))
4957         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4960       auto *OP = dyn_cast<PHINode>(OV);
4961       if (OP && Legal->isFirstOrderRecurrence(OP))
4962         continue;
4963       // If all the users of the operand are uniform, then add the
4964       // operand into the uniform worklist.
4965       auto *OI = cast<Instruction>(OV);
4966       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4967             auto *J = cast<Instruction>(U);
4968             return Worklist.count(J) ||
4969                    (OI == getLoadStorePointerOperand(J) &&
4970                     isUniformDecision(J, VF));
4971           }))
4972         addToWorklistIfAllowed(OI);
4973     }
4974   }
4975 
4976   // Returns true if Ptr is the pointer operand of a memory access instruction
4977   // I, and I is known to not require scalarization.
4978   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4979     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4980   };
4981 
4982   // For an instruction to be added into Worklist above, all its users inside
4983   // the loop should also be in Worklist. However, this condition cannot be
4984   // true for phi nodes that form a cyclic dependence. We must process phi
4985   // nodes separately. An induction variable will remain uniform if all users
4986   // of the induction variable and induction variable update remain uniform.
4987   // The code below handles both pointer and non-pointer induction variables.
4988   for (auto &Induction : Legal->getInductionVars()) {
4989     auto *Ind = Induction.first;
4990     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4991 
4992     // Determine if all users of the induction variable are uniform after
4993     // vectorization.
4994     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4995       auto *I = cast<Instruction>(U);
4996       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4997              isVectorizedMemAccessUse(I, Ind);
4998     });
4999     if (!UniformInd)
5000       continue;
5001 
5002     // Determine if all users of the induction variable update instruction are
5003     // uniform after vectorization.
5004     auto UniformIndUpdate =
5005         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5006           auto *I = cast<Instruction>(U);
5007           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5008                  isVectorizedMemAccessUse(I, IndUpdate);
5009         });
5010     if (!UniformIndUpdate)
5011       continue;
5012 
5013     // The induction variable and its update instruction will remain uniform.
5014     addToWorklistIfAllowed(Ind);
5015     addToWorklistIfAllowed(IndUpdate);
5016   }
5017 
5018   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5019 }
5020 
5021 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5022   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5023 
5024   if (Legal->getRuntimePointerChecking()->Need) {
5025     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5026         "runtime pointer checks needed. Enable vectorization of this "
5027         "loop with '#pragma clang loop vectorize(enable)' when "
5028         "compiling with -Os/-Oz",
5029         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5030     return true;
5031   }
5032 
5033   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5034     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5035         "runtime SCEV checks needed. Enable vectorization of this "
5036         "loop with '#pragma clang loop vectorize(enable)' when "
5037         "compiling with -Os/-Oz",
5038         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5039     return true;
5040   }
5041 
5042   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5043   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
5047         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5048     return true;
5049   }
5050 
5051   return false;
5052 }
5053 
5054 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5055                                                             unsigned UserIC) {
5056   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5059     reportVectorizationFailure(
5060         "Not inserting runtime ptr check for divergent target",
5061         "runtime pointer checks needed. Not enabled for divergent target",
5062         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5063     return None;
5064   }
5065 
5066   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5067   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5068   if (TC == 1) {
5069     reportVectorizationFailure("Single iteration (non) loop",
5070         "loop trip count is one, irrelevant for vectorization",
5071         "SingleIterationLoop", ORE, TheLoop);
5072     return None;
5073   }
5074 
5075   switch (ScalarEpilogueStatus) {
5076   case CM_ScalarEpilogueAllowed:
5077     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5078   case CM_ScalarEpilogueNotNeededUsePredicate:
5079     LLVM_DEBUG(
5080         dbgs() << "LV: vector predicate hint/switch found.\n"
5081                << "LV: Not allowing scalar epilogue, creating predicated "
5082                << "vector loop.\n");
5083     break;
5084   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5085     // fallthrough as a special case of OptForSize
5086   case CM_ScalarEpilogueNotAllowedOptSize:
5087     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5088       LLVM_DEBUG(
5089           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5090     else
5091       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5092                         << "count.\n");
5093 
5094     // Bail if runtime checks are required, which are not good when optimising
5095     // for size.
5096     if (runtimeChecksRequired())
5097       return None;
5098     break;
5099   }
5100 
5101   // Now try the tail folding
5102 
5103   // Invalidate interleave groups that require an epilogue if we can't mask
5104   // the interleave-group.
5105   if (!useMaskedInterleavedAccesses(TTI)) {
5106     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5107            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5110     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5111   }
5112 
5113   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5114   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5115   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
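  // For example, a known trip count of 64 with MaxVF = 8 and a user-specified
  // interleave count of 2 gives MaxVFtimesIC = 16; since 64 % 16 == 0, no
  // scalar tail is needed.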
5116   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5117     // Accept MaxVF if we do not have a tail.
5118     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5119     return MaxVF;
5120   }
5121 
5122   // If we don't know the precise trip count, or if the trip count that we
5123   // found modulo the vectorization factor is not zero, try to fold the tail
5124   // by masking.
5125   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5126   if (Legal->prepareToFoldTailByMasking()) {
5127     FoldTailByMasking = true;
5128     return MaxVF;
5129   }
5130 
5131   if (TC == 0) {
5132     reportVectorizationFailure(
5133         "Unable to calculate the loop count due to complex control flow",
5134         "unable to calculate the loop count due to complex control flow",
5135         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5136     return None;
5137   }
5138 
5139   reportVectorizationFailure(
5140       "Cannot optimize for size and vectorize at the same time.",
5141       "cannot optimize for size and vectorize at the same time. "
5142       "Enable vectorization of this loop with '#pragma clang loop "
5143       "vectorize(enable)' when compiling with -Os/-Oz",
5144       "NoTailLoopWithOptForSize", ORE, TheLoop);
5145   return None;
5146 }
5147 
5148 unsigned
5149 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5150   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5151   unsigned SmallestType, WidestType;
5152   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5153   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5154 
5155   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5159   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5160 
5161   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5162 
  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
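  // For example, a 256-bit widest safe register and a 32-bit widest type give
  // MaxVectorSize = PowerOf2Floor(256 / 32) = 8.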
5165   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5166 
5167   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5168                     << " / " << WidestType << " bits.\n");
5169   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5170                     << WidestRegister << " bits.\n");
5171 
5172   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5173                                  " into one vector!");
5174   if (MaxVectorSize == 0) {
5175     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5176     MaxVectorSize = 1;
5177     return MaxVectorSize;
5178   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5179              isPowerOf2_32(ConstTripCount)) {
5180     // We need to clamp the VF to be the ConstTripCount. There is no point in
5181     // choosing a higher viable VF as done in the loop below.
5182     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5183                       << ConstTripCount << "\n");
5184     MaxVectorSize = ConstTripCount;
5185     return MaxVectorSize;
5186   }
5187 
5188   unsigned MaxVF = MaxVectorSize;
5189   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5190       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5191     // Collect all viable vectorization factors larger than the default MaxVF
5192     // (i.e. MaxVectorSize).
5193     SmallVector<unsigned, 8> VFs;
5194     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5195     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5196       VFs.push_back(VS);
5197 
5198     // For each VF calculate its register usage.
5199     auto RUs = calculateRegisterUsage(VFs);
5200 
5201     // Select the largest VF which doesn't require more registers than existing
5202     // ones.
5203     for (int i = RUs.size() - 1; i >= 0; --i) {
5204       bool Selected = true;
5205       for (auto& pair : RUs[i].MaxLocalUsers) {
5206         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5207         if (pair.second > TargetNumRegisters)
5208           Selected = false;
5209       }
5210       if (Selected) {
5211         MaxVF = VFs[i];
5212         break;
5213       }
5214     }
5215     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5216       if (MaxVF < MinVF) {
5217         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5218                           << ") with target's minimum: " << MinVF << '\n');
5219         MaxVF = MinVF;
5220       }
5221     }
5222   }
5223   return MaxVF;
5224 }
5225 
5226 VectorizationFactor
5227 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5228   float Cost = expectedCost(1).first;
5229   const float ScalarCost = Cost;
5230   unsigned Width = 1;
5231   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5232 
5233   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5234   if (ForceVectorization && MaxVF > 1) {
5235     // Ignore scalar width, because the user explicitly wants vectorization.
5236     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5237     // evaluation.
5238     Cost = std::numeric_limits<float>::max();
5239   }
5240 
5241   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
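    // For example, a VF=4 loop body with an expected cost of 20 has a
    // per-scalar-iteration cost of 20 / 4 = 5, which is what gets compared
    // against the scalar loop cost above.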
5245     VectorizationCostTy C = expectedCost(i);
5246     float VectorCost = C.first / (float)i;
5247     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5248                       << " costs: " << (int)VectorCost << ".\n");
5249     if (!C.second && !ForceVectorization) {
5250       LLVM_DEBUG(
5251           dbgs() << "LV: Not considering vector loop of width " << i
5252                  << " because it will not generate any vector instructions.\n");
5253       continue;
5254     }
5255     if (VectorCost < Cost) {
5256       Cost = VectorCost;
5257       Width = i;
5258     }
5259   }
5260 
5261   if (!EnableCondStoresVectorization && NumPredStores) {
5262     reportVectorizationFailure("There are conditional stores.",
5263         "store that is conditionally executed prevents vectorization",
5264         "ConditionalStore", ORE, TheLoop);
5265     Width = 1;
5266     Cost = ScalarCost;
5267   }
5268 
5269   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5270              << "LV: Vectorization seems to be not beneficial, "
5271              << "but was forced by a user.\n");
5272   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5273   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5274   return Factor;
5275 }
5276 
5277 std::pair<unsigned, unsigned>
5278 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5279   unsigned MinWidth = -1U;
5280   unsigned MaxWidth = 8;
5281   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5282 
5283   // For each block.
5284   for (BasicBlock *BB : TheLoop->blocks()) {
5285     // For each instruction in the loop.
5286     for (Instruction &I : BB->instructionsWithoutDebug()) {
5287       Type *T = I.getType();
5288 
5289       // Skip ignored values.
5290       if (ValuesToIgnore.count(&I))
5291         continue;
5292 
5293       // Only examine Loads, Stores and PHINodes.
5294       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5295         continue;
5296 
5297       // Examine PHI nodes that are reduction variables. Update the type to
5298       // account for the recurrence type.
5299       if (auto *PN = dyn_cast<PHINode>(&I)) {
5300         if (!Legal->isReductionVariable(PN))
5301           continue;
5302         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5303         T = RdxDesc.getRecurrenceType();
5304       }
5305 
5306       // Examine the stored values.
5307       if (auto *ST = dyn_cast<StoreInst>(&I))
5308         T = ST->getValueOperand()->getType();
5309 
5310       // Ignore loaded pointer types and stored pointer types that are not
5311       // vectorizable.
5312       //
5313       // FIXME: The check here attempts to predict whether a load or store will
5314       //        be vectorized. We only know this for certain after a VF has
5315       //        been selected. Here, we assume that if an access can be
5316       //        vectorized, it will be. We should also look at extending this
5317       //        optimization to non-pointer types.
5318       //
5319       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5320           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5321         continue;
5322 
5323       MinWidth = std::min(MinWidth,
5324                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5325       MaxWidth = std::max(MaxWidth,
5326                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5327     }
5328   }
5329 
5330   return {MinWidth, MaxWidth};
5331 }
5332 
5333 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5334                                                            unsigned LoopCost) {
5335   // -- The interleave heuristics --
5336   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5337   // There are many micro-architectural considerations that we can't predict
5338   // at this level. For example, frontend pressure (on decode or fetch) due to
5339   // code size, or the number and capabilities of the execution ports.
5340   //
5341   // We use the following heuristics to select the interleave count:
5342   // 1. If the code has reductions, then we interleave to break the cross
5343   // iteration dependency.
5344   // 2. If the loop is really small, then we interleave to reduce the loop
5345   // overhead.
5346   // 3. We don't interleave if we think that we will spill registers to memory
5347   // due to the increased register pressure.
5348 
5349   if (!isScalarEpilogueAllowed())
5350     return 1;
5351 
  // We already used the maximum safe dependence distance to limit the
  // vectorization factor, so do not interleave.
5353   if (Legal->getMaxSafeDepDistBytes() != -1U)
5354     return 1;
5355 
5356   // Do not interleave loops with a relatively small known or estimated trip
5357   // count.
5358   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5359   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5360     return 1;
5361 
5362   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so clamp each to at least 1, i.e. assume
  // that at least one instruction uses at least one register.
5365   for (auto& pair : R.MaxLocalUsers) {
5366     pair.second = std::max(pair.second, 1U);
5367   }
5368 
5369   // We calculate the interleave count using the following formula.
5370   // Subtract the number of loop invariants from the number of available
5371   // registers. These registers are used by all of the interleaved instances.
5372   // Next, divide the remaining registers by the number of registers that is
5373   // required by the loop, in order to estimate how many parallel instances
5374   // fit without causing spills. All of this is rounded down if necessary to be
5375   // a power of two. We want power of two interleave count to simplify any
5376   // addressing operations or alignment considerations.
5377   // We also want power of two interleave counts to ensure that the induction
5378   // variable of the vector loop wraps to zero, when tail is folded by masking;
5379   // this currently happens when OptForSize, in which case IC is set to 1 above.
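  // For example, with 32 registers available in a class, 2 of them holding
  // loop-invariant values and a maximum local usage of 6 registers, the
  // estimate is PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 instances.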
5380   unsigned IC = UINT_MAX;
5381 
5382   for (auto& pair : R.MaxLocalUsers) {
5383     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5384     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5385                       << " registers of "
5386                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5387     if (VF == 1) {
5388       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5389         TargetNumRegisters = ForceTargetNumScalarRegs;
5390     } else {
5391       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5392         TargetNumRegisters = ForceTargetNumVectorRegs;
5393     }
5394     unsigned MaxLocalUsers = pair.second;
5395     unsigned LoopInvariantRegs = 0;
5396     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5397       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5398 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5400     // Don't count the induction variable as interleaved.
5401     if (EnableIndVarRegisterHeur) {
5402       TmpIC =
5403           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5404                         std::max(1U, (MaxLocalUsers - 1)));
5405     }
5406 
5407     IC = std::min(IC, TmpIC);
5408   }
5409 
5410   // Clamp the interleave ranges to reasonable counts.
5411   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5412 
5413   // Check if the user has overridden the max.
5414   if (VF == 1) {
5415     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5416       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5417   } else {
5418     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5419       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5420   }
5421 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count so it does not exceed the trip count divided by VF.
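  // For example, an estimated trip count of 12 at VF = 4 caps the interleave
  // count at 12 / 4 = 3 (if the target would otherwise allow more).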
5424   if (BestKnownTC) {
5425     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5426   }
5427 
5428   // If we did not calculate the cost for VF (because the user selected the VF)
5429   // then we calculate the cost of VF here.
5430   if (LoopCost == 0)
5431     LoopCost = expectedCost(VF).first;
5432 
5433   assert(LoopCost && "Non-zero loop cost expected");
5434 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5437   if (IC > MaxInterleaveCount)
5438     IC = MaxInterleaveCount;
5439   else if (IC < 1)
5440     IC = 1;
5441 
5442   // Interleave if we vectorized this loop and there is a reduction that could
5443   // benefit from interleaving.
5444   if (VF > 1 && !Legal->getReductionVars().empty()) {
5445     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5446     return IC;
5447   }
5448 
5449   // Note that if we've already vectorized the loop we will have done the
5450   // runtime check and so interleaving won't require further checks.
5451   bool InterleavingRequiresRuntimePointerCheck =
5452       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5453 
5454   // We want to interleave small loops in order to reduce the loop overhead and
5455   // potentially expose ILP opportunities.
5456   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5457   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5458     // We assume that the cost overhead is 1 and we use the cost model
5459     // to estimate the cost of the loop and interleave until the cost of the
5460     // loop overhead is about 5% of the cost of the loop.
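    // For example, with a small-loop cost threshold of 20 and a loop cost of
    // 3, this yields SmallIC = min(IC, PowerOf2Floor(20 / 3)) = min(IC, 4).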
5461     unsigned SmallIC =
5462         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5463 
5464     // Interleave until store/load ports (estimated by max interleave count) are
5465     // saturated.
5466     unsigned NumStores = Legal->getNumStores();
5467     unsigned NumLoads = Legal->getNumLoads();
5468     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5469     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5470 
5471     // If we have a scalar reduction (vector reductions are already dealt with
5472     // by this point), we can increase the critical path length if the loop
5473     // we're interleaving is inside another loop. Limit, by default to 2, so the
5474     // critical path only gets increased by one reduction operation.
5475     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5476       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5477       SmallIC = std::min(SmallIC, F);
5478       StoresIC = std::min(StoresIC, F);
5479       LoadsIC = std::min(LoadsIC, F);
5480     }
5481 
5482     if (EnableLoadStoreRuntimeInterleave &&
5483         std::max(StoresIC, LoadsIC) > SmallIC) {
5484       LLVM_DEBUG(
5485           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5486       return std::max(StoresIC, LoadsIC);
5487     }
5488 
5489     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5490     return SmallIC;
5491   }
5492 
5493   // Interleave if this is a large loop (small loops are already dealt with by
5494   // this point) that could benefit from interleaving.
5495   bool HasReductions = !Legal->getReductionVars().empty();
5496   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5497     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5498     return IC;
5499   }
5500 
5501   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5502   return 1;
5503 }
5504 
5505 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5506 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5507   // This function calculates the register usage by measuring the highest number
5508   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
5511   // met before their users. We assume that each instruction that has in-loop
5512   // users starts an interval. We record every time that an in-loop value is
5513   // used, so we have a list of the first and last occurrences of each
5514   // instruction. Next, we transpose this data structure into a multi map that
5515   // holds the list of intervals that *end* at a specific location. This multi
5516   // map allows us to perform a linear search. We scan the instructions linearly
5517   // and record each time that a new interval starts, by placing it in a set.
5518   // If we find this value in the multi-map then we remove it from the set.
5519   // The max register usage is the maximum size of the set.
5520   // We also search for instructions that are defined outside the loop, but are
5521   // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
5524   LoopBlocksDFS DFS(TheLoop);
5525   DFS.perform(LI);
5526 
5527   RegisterUsage RU;
5528 
5529   // Each 'key' in the map opens a new interval. The values
5530   // of the map are the index of the 'last seen' usage of the
5531   // instruction that is the key.
5532   using IntervalMap = DenseMap<Instruction *, unsigned>;
5533 
5534   // Maps instruction to its index.
5535   SmallVector<Instruction *, 64> IdxToInstr;
5536   // Marks the end of each interval.
5537   IntervalMap EndPoint;
  // Saves the set of instructions that are used within the loop.
5539   SmallPtrSet<Instruction *, 8> Ends;
5540   // Saves the list of values that are used in the loop but are
5541   // defined outside the loop, such as arguments and constants.
5542   SmallPtrSet<Value *, 8> LoopInvariants;
5543 
5544   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5545     for (Instruction &I : BB->instructionsWithoutDebug()) {
5546       IdxToInstr.push_back(&I);
5547 
5548       // Save the end location of each USE.
5549       for (Value *U : I.operands()) {
5550         auto *Instr = dyn_cast<Instruction>(U);
5551 
5552         // Ignore non-instruction values such as arguments, constants, etc.
5553         if (!Instr)
5554           continue;
5555 
5556         // If this instruction is outside the loop then record it and continue.
5557         if (!TheLoop->contains(Instr)) {
5558           LoopInvariants.insert(Instr);
5559           continue;
5560         }
5561 
5562         // Overwrite previous end points.
5563         EndPoint[Instr] = IdxToInstr.size();
5564         Ends.insert(Instr);
5565       }
5566     }
5567   }
5568 
5569   // Saves the list of intervals that end with the index in 'key'.
5570   using InstrList = SmallVector<Instruction *, 2>;
5571   DenseMap<unsigned, InstrList> TransposeEnds;
5572 
5573   // Transpose the EndPoints to a list of values that end at each index.
5574   for (auto &Interval : EndPoint)
5575     TransposeEnds[Interval.second].push_back(Interval.first);
5576 
5577   SmallPtrSet<Instruction *, 8> OpenIntervals;
5578 
5579   // Get the size of the widest register.
5580   unsigned MaxSafeDepDist = -1U;
5581   if (Legal->getMaxSafeDepDistBytes() != -1U)
5582     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5583   unsigned WidestRegister =
5584       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5585   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5586 
5587   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5588   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5589 
5590   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5591 
5592   // A lambda that gets the register usage for the given type and VF.
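  // For example, an i64 value at VF = 4 on a target whose widest register is
  // 128 bits needs std::max(1, 4 * 64 / 128) = 2 registers.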
5593   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5594     if (Ty->isTokenTy())
5595       return 0U;
5596     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5597     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5598   };
5599 
5600   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5601     Instruction *I = IdxToInstr[i];
5602 
5603     // Remove all of the instructions that end at this location.
5604     InstrList &List = TransposeEnds[i];
5605     for (Instruction *ToRemove : List)
5606       OpenIntervals.erase(ToRemove);
5607 
5608     // Ignore instructions that are never used within the loop.
5609     if (!Ends.count(I))
5610       continue;
5611 
5612     // Skip ignored values.
5613     if (ValuesToIgnore.count(I))
5614       continue;
5615 
5616     // For each VF find the maximum usage of registers.
5617     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5618       // Count the number of live intervals.
5619       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5620 
5621       if (VFs[j] == 1) {
5622         for (auto Inst : OpenIntervals) {
5623           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5624           if (RegUsage.find(ClassID) == RegUsage.end())
5625             RegUsage[ClassID] = 1;
5626           else
5627             RegUsage[ClassID] += 1;
5628         }
5629       } else {
5630         collectUniformsAndScalars(VFs[j]);
5631         for (auto Inst : OpenIntervals) {
5632           // Skip ignored values for VF > 1.
5633           if (VecValuesToIgnore.count(Inst))
5634             continue;
5635           if (isScalarAfterVectorization(Inst, VFs[j])) {
5636             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5637             if (RegUsage.find(ClassID) == RegUsage.end())
5638               RegUsage[ClassID] = 1;
5639             else
5640               RegUsage[ClassID] += 1;
5641           } else {
5642             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5643             if (RegUsage.find(ClassID) == RegUsage.end())
5644               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5645             else
5646               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5647           }
5648         }
5649       }
5650 
5651       for (auto& pair : RegUsage) {
5652         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5653           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5654         else
5655           MaxUsages[j][pair.first] = pair.second;
5656       }
5657     }
5658 
5659     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5660                       << OpenIntervals.size() << '\n');
5661 
5662     // Add the current instruction to the list of open intervals.
5663     OpenIntervals.insert(I);
5664   }
5665 
5666   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5667     SmallMapVector<unsigned, unsigned, 4> Invariant;
5668 
5669     for (auto Inst : LoopInvariants) {
5670       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5671       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5672       if (Invariant.find(ClassID) == Invariant.end())
5673         Invariant[ClassID] = Usage;
5674       else
5675         Invariant[ClassID] += Usage;
5676     }
5677 
5678     LLVM_DEBUG({
5679       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5680       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5681              << " item\n";
5682       for (const auto &pair : MaxUsages[i]) {
5683         dbgs() << "LV(REG): RegisterClass: "
5684                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5685                << " registers\n";
5686       }
5687       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5688              << " item\n";
5689       for (const auto &pair : Invariant) {
5690         dbgs() << "LV(REG): RegisterClass: "
5691                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5692                << " registers\n";
5693       }
5694     });
5695 
5696     RU.LoopInvariantRegs = Invariant;
5697     RU.MaxLocalUsers = MaxUsages[i];
5698     RUs[i] = RU;
5699   }
5700 
5701   return RUs;
5702 }
5703 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5705   // TODO: Cost model for emulated masked load/store is completely
5706   // broken. This hack guides the cost model to use an artificially
5707   // high enough value to practically disable vectorization with such
5708   // operations, except where previously deployed legality hack allowed
5709   // using very low cost values. This is to avoid regressions coming simply
5710   // from moving "masked load/store" check from legality to cost model.
5711   // Masked Load/Gather emulation was previously never allowed.
5712   // Limited number of Masked Store/Scatter emulation was allowed.
5713   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5714   return isa<LoadInst>(I) ||
5715          (isa<StoreInst>(I) &&
5716           NumPredStores > NumberOfStoresToPredicate);
5717 }
5718 
5719 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5720   // If we aren't vectorizing the loop, or if we've already collected the
5721   // instructions to scalarize, there's nothing to do. Collection may already
5722   // have occurred if we have a user-selected VF and are now computing the
5723   // expected cost for interleaving.
5724   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5725     return;
5726 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5728   // not profitable to scalarize any instructions, the presence of VF in the
5729   // map will indicate that we've analyzed it already.
5730   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5731 
5732   // Find all the instructions that are scalar with predication in the loop and
5733   // determine if it would be better to not if-convert the blocks they are in.
5734   // If so, we also record the instructions to scalarize.
5735   for (BasicBlock *BB : TheLoop->blocks()) {
5736     if (!blockNeedsPredication(BB))
5737       continue;
5738     for (Instruction &I : *BB)
5739       if (isScalarWithPredication(&I)) {
5740         ScalarCostsTy ScalarCosts;
5741         // Do not apply discount logic if hacked cost is needed
5742         // for emulated masked memrefs.
5743         if (!useEmulatedMaskMemRefHack(&I) &&
5744             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5745           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5746         // Remember that BB will remain after vectorization.
5747         PredicatedBBsAfterVectorization.insert(BB);
5748       }
5749   }
5750 }
5751 
5752 int LoopVectorizationCostModel::computePredInstDiscount(
5753     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5754     unsigned VF) {
5755   assert(!isUniformAfterVectorization(PredInst, VF) &&
5756          "Instruction marked uniform-after-vectorization will be predicated");
5757 
5758   // Initialize the discount to zero, meaning that the scalar version and the
5759   // vector version cost the same.
5760   int Discount = 0;
5761 
5762   // Holds instructions to analyze. The instructions we visit are mapped in
5763   // ScalarCosts. Those instructions are the ones that would be scalarized if
5764   // we find that the scalar version costs less.
5765   SmallVector<Instruction *, 8> Worklist;
5766 
5767   // Returns true if the given instruction can be scalarized.
5768   auto canBeScalarized = [&](Instruction *I) -> bool {
5769     // We only attempt to scalarize instructions forming a single-use chain
5770     // from the original predicated block that would otherwise be vectorized.
5771     // Although not strictly necessary, we give up on instructions we know will
5772     // already be scalar to avoid traversing chains that are unlikely to be
5773     // beneficial.
5774     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5775         isScalarAfterVectorization(I, VF))
5776       return false;
5777 
5778     // If the instruction is scalar with predication, it will be analyzed
5779     // separately. We ignore it within the context of PredInst.
5780     if (isScalarWithPredication(I))
5781       return false;
5782 
5783     // If any of the instruction's operands are uniform after vectorization,
5784     // the instruction cannot be scalarized. This prevents, for example, a
5785     // masked load from being scalarized.
5786     //
5787     // We assume we will only emit a value for lane zero of an instruction
5788     // marked uniform after vectorization, rather than VF identical values.
5789     // Thus, if we scalarize an instruction that uses a uniform, we would
5790     // create uses of values corresponding to the lanes we aren't emitting code
5791     // for. This behavior can be changed by allowing getScalarValue to clone
5792     // the lane zero values for uniforms rather than asserting.
5793     for (Use &U : I->operands())
5794       if (auto *J = dyn_cast<Instruction>(U.get()))
5795         if (isUniformAfterVectorization(J, VF))
5796           return false;
5797 
5798     // Otherwise, we can scalarize the instruction.
5799     return true;
5800   };
5801 
5802   // Compute the expected cost discount from scalarizing the entire expression
5803   // feeding the predicated instruction. We currently only consider expressions
5804   // that are single-use instruction chains.
5805   Worklist.push_back(PredInst);
5806   while (!Worklist.empty()) {
5807     Instruction *I = Worklist.pop_back_val();
5808 
5809     // If we've already analyzed the instruction, there's nothing to do.
5810     if (ScalarCosts.find(I) != ScalarCosts.end())
5811       continue;
5812 
5813     // Compute the cost of the vector instruction. Note that this cost already
5814     // includes the scalarization overhead of the predicated instruction.
5815     unsigned VectorCost = getInstructionCost(I, VF).first;
5816 
5817     // Compute the cost of the scalarized instruction. This cost is the cost of
5818     // the instruction as if it wasn't if-converted and instead remained in the
5819     // predicated block. We will scale this cost by block probability after
5820     // computing the scalarization overhead.
5821     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5822 
5823     // Compute the scalarization overhead of needed insertelement instructions
5824     // and phi nodes.
5825     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5826       ScalarCost += TTI.getScalarizationOverhead(
5827           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5828           APInt::getAllOnesValue(VF), true, false);
5829       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
5830                                             TTI::TCK_RecipThroughput);
5831     }
5832 
5833     // Compute the scalarization overhead of needed extractelement
5834     // instructions. For each of the instruction's operands, if the operand can
5835     // be scalarized, add it to the worklist; otherwise, account for the
5836     // overhead.
5837     for (Use &U : I->operands())
5838       if (auto *J = dyn_cast<Instruction>(U.get())) {
5839         assert(VectorType::isValidElementType(J->getType()) &&
5840                "Instruction has non-scalar type");
5841         if (canBeScalarized(J))
5842           Worklist.push_back(J);
5843         else if (needsExtract(J, VF))
5844           ScalarCost += TTI.getScalarizationOverhead(
5845               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5846               APInt::getAllOnesValue(VF), false, true);
5847       }
5848 
5849     // Scale the total scalar cost by block probability.
5850     ScalarCost /= getReciprocalPredBlockProb();
5851 
5852     // Compute the discount. A non-negative discount means the vector version
5853     // of the instruction costs more, and scalarizing would be beneficial.
5854     Discount += VectorCost - ScalarCost;
5855     ScalarCosts[I] = ScalarCost;
5856   }
5857 
5858   return Discount;
5859 }
5860 
5861 LoopVectorizationCostModel::VectorizationCostTy
5862 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5863   VectorizationCostTy Cost;
5864 
5865   // For each block.
5866   for (BasicBlock *BB : TheLoop->blocks()) {
5867     VectorizationCostTy BlockCost;
5868 
5869     // For each instruction in the old loop.
5870     for (Instruction &I : BB->instructionsWithoutDebug()) {
5871       // Skip ignored values.
5872       if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
5873         continue;
5874 
5875       VectorizationCostTy C = getInstructionCost(&I, VF);
5876 
5877       // Check if we should override the cost.
5878       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5879         C.first = ForceTargetInstructionCost;
5880 
5881       BlockCost.first += C.first;
5882       BlockCost.second |= C.second;
5883       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5884                         << " for VF " << VF << " For instruction: " << I
5885                         << '\n');
5886     }
5887 
5888     // If we are vectorizing a predicated block, it will have been
5889     // if-converted. This means that the block's instructions (aside from
5890     // stores and instructions that may divide by zero) will now be
5891     // unconditionally executed. For the scalar case, we may not always execute
5892     // the predicated block. Thus, scale the block's cost by the probability of
5893     // executing it.
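    // For example, if a predicated block is assumed to execute on half of the
    // scalar iterations, its cost of 10 contributes only 10 / 2 = 5 to the
    // scalar loop cost.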
5894     if (VF == 1 && blockNeedsPredication(BB))
5895       BlockCost.first /= getReciprocalPredBlockProb();
5896 
5897     Cost.first += BlockCost.first;
5898     Cost.second |= BlockCost.second;
5899   }
5900 
5901   return Cost;
5902 }
5903 
5904 /// Gets Address Access SCEV after verifying that the access pattern
5905 /// is loop invariant except the induction variable dependence.
5906 ///
5907 /// This SCEV can be sent to the Target in order to estimate the address
5908 /// calculation cost.
5909 static const SCEV *getAddressAccessSCEV(
5910               Value *Ptr,
5911               LoopVectorizationLegality *Legal,
5912               PredicatedScalarEvolution &PSE,
5913               const Loop *TheLoop) {
5914 
5915   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5916   if (!Gep)
5917     return nullptr;
5918 
5919   // We are looking for a gep with all loop invariant indices except for one
5920   // which should be an induction variable.
5921   auto SE = PSE.getSE();
5922   unsigned NumOperands = Gep->getNumOperands();
5923   for (unsigned i = 1; i < NumOperands; ++i) {
5924     Value *Opd = Gep->getOperand(i);
5925     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5926         !Legal->isInductionVariable(Opd))
5927       return nullptr;
5928   }
5929 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5931   return PSE.getSCEV(Ptr);
5932 }
5933 
5934 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5935   return Legal->hasStride(I->getOperand(0)) ||
5936          Legal->hasStride(I->getOperand(1));
5937 }
5938 
5939 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5940                                                                  unsigned VF) {
5941   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5942   Type *ValTy = getMemInstValueType(I);
5943   auto SE = PSE.getSE();
5944 
5945   unsigned AS = getLoadStoreAddressSpace(I);
5946   Value *Ptr = getLoadStorePointerOperand(I);
5947   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5948 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5951   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5952 
5953   // Get the cost of the scalar memory instruction and address computation.
5954   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5955 
5956   // Don't pass *I here, since it is scalar but will actually be part of a
5957   // vectorized loop where the user of it is a vectorized instruction.
5958   const Align Alignment = getLoadStoreAlignment(I);
5959   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5960                                    Alignment, AS,
5961                                    TTI::TCK_RecipThroughput);
5962 
5963   // Get the overhead of the extractelement and insertelement instructions
5964   // we might create due to scalarization.
5965   Cost += getScalarizationOverhead(I, VF);
5966 
5967   // If we have a predicated store, it may not be executed for each vector
5968   // lane. Scale the cost by the probability of executing the predicated
5969   // block.
5970   if (isPredicatedInst(I)) {
5971     Cost /= getReciprocalPredBlockProb();
5972 
5973     if (useEmulatedMaskMemRefHack(I))
5974       // Artificially setting to a high enough value to practically disable
5975       // vectorization with such operations.
5976       Cost = 3000000;
5977   }
5978 
5979   return Cost;
5980 }
5981 
5982 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5983                                                              unsigned VF) {
5984   Type *ValTy = getMemInstValueType(I);
5985   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5986   Value *Ptr = getLoadStorePointerOperand(I);
5987   unsigned AS = getLoadStoreAddressSpace(I);
5988   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5989   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5990 
5991   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5992          "Stride should be 1 or -1 for consecutive memory access");
5993   const Align Alignment = getLoadStoreAlignment(I);
5994   unsigned Cost = 0;
5995   if (Legal->isMaskRequired(I))
5996     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5997                                       CostKind);
5998   else
5999     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6000                                 CostKind, I);
6001 
6002   bool Reverse = ConsecutiveStride < 0;
6003   if (Reverse)
6004     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6005   return Cost;
6006 }
6007 
6008 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6009                                                          unsigned VF) {
6010   Type *ValTy = getMemInstValueType(I);
6011   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6012   const Align Alignment = getLoadStoreAlignment(I);
6013   unsigned AS = getLoadStoreAddressSpace(I);
6014   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6015   if (isa<LoadInst>(I)) {
6016     return TTI.getAddressComputationCost(ValTy) +
6017            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6018                                CostKind) +
6019            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6020   }
6021   StoreInst *SI = cast<StoreInst>(I);
6022 
6023   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
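  // If the stored value varies per lane, only the value of the last vector
  // lane (element VF - 1) is actually written to the uniform address, so we
  // account for a single extractelement.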
6024   return TTI.getAddressComputationCost(ValTy) +
6025          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6026                              CostKind) +
6027          (isLoopInvariantStoreValue
6028               ? 0
6029               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6030                                        VF - 1));
6031 }
6032 
6033 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6034                                                           unsigned VF) {
6035   Type *ValTy = getMemInstValueType(I);
6036   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6037   const Align Alignment = getLoadStoreAlignment(I);
6038   const Value *Ptr = getLoadStorePointerOperand(I);
6039 
6040   return TTI.getAddressComputationCost(VectorTy) +
6041          TTI.getGatherScatterOpCost(
6042              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6043              TargetTransformInfo::TCK_RecipThroughput, I);
6044 }
6045 
6046 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6047                                                             unsigned VF) {
6048   Type *ValTy = getMemInstValueType(I);
6049   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6050   unsigned AS = getLoadStoreAddressSpace(I);
6051 
6052   auto Group = getInterleavedAccessGroup(I);
6053   assert(Group && "Fail to get an interleaved access group.");
6054 
6055   unsigned InterleaveFactor = Group->getFactor();
6056   auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
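  // For example, a factor-2 group of i32 accesses at VF = 4 is costed as a
  // single wide <8 x i32> access plus whatever shuffles the target charges
  // for (de)interleaving the group members.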
6057 
6058   // Holds the indices of existing members in an interleaved load group.
6059   // An interleaved store group doesn't need this as it doesn't allow gaps.
6060   SmallVector<unsigned, 4> Indices;
6061   if (isa<LoadInst>(I)) {
6062     for (unsigned i = 0; i < InterleaveFactor; i++)
6063       if (Group->getMember(i))
6064         Indices.push_back(i);
6065   }
6066 
6067   // Calculate the cost of the whole interleaved group.
6068   bool UseMaskForGaps =
6069       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6070   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6071       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6072       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6073 
6074   if (Group->isReverse()) {
6075     // TODO: Add support for reversed masked interleaved access.
6076     assert(!Legal->isMaskRequired(I) &&
6077            "Reverse masked interleaved access not supported.");
6078     Cost += Group->getNumMembers() *
6079             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6080   }
6081   return Cost;
6082 }
6083 
6084 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6085                                                               unsigned VF) {
6086   // Calculate scalar cost only. Vectorization cost should be ready at this
6087   // moment.
6088   if (VF == 1) {
6089     Type *ValTy = getMemInstValueType(I);
6090     const Align Alignment = getLoadStoreAlignment(I);
6091     unsigned AS = getLoadStoreAddressSpace(I);
6092 
6093     return TTI.getAddressComputationCost(ValTy) +
6094            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6095                                TTI::TCK_RecipThroughput, I);
6096   }
6097   return getWideningCost(I, VF);
6098 }
6099 
6100 LoopVectorizationCostModel::VectorizationCostTy
6101 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
6102   // If we know that this instruction will remain uniform, check the cost of
6103   // the scalar version.
6104   if (isUniformAfterVectorization(I, VF))
6105     VF = 1;
6106 
6107   if (VF > 1 && isProfitableToScalarize(I, VF))
6108     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6109 
6110   // Forced scalars do not have any scalarization overhead.
6111   auto ForcedScalar = ForcedScalars.find(VF);
6112   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
6113     auto InstSet = ForcedScalar->second;
6114     if (InstSet.count(I))
6115       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6116   }
6117 
6118   Type *VectorTy;
6119   unsigned C = getInstructionCost(I, VF, VectorTy);
6120 
6121   bool TypeNotScalarized =
6122       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6123   return VectorizationCostTy(C, TypeNotScalarized);
6124 }
6125 
6126 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6127                                                               unsigned VF) {
6128 
6129   if (VF == 1)
6130     return 0;
6131 
6132   unsigned Cost = 0;
6133   Type *RetTy = ToVectorTy(I->getType(), VF);
6134   if (!RetTy->isVoidTy() &&
6135       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6136     Cost += TTI.getScalarizationOverhead(
6137         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
6138 
6139   // Some targets keep addresses scalar.
6140   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6141     return Cost;
6142 
6143   // Some targets support efficient element stores.
6144   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6145     return Cost;
6146 
6147   // Collect operands to consider.
6148   CallInst *CI = dyn_cast<CallInst>(I);
6149   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6150 
6151   // Skip operands that do not require extraction/scalarization and do not incur
6152   // any overhead.
6153   return Cost + TTI.getOperandsScalarizationOverhead(
6154                     filterExtractingOperands(Ops, VF), VF);
6155 }
6156 
6157 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6158   if (VF == 1)
6159     return;
6160   NumPredStores = 0;
6161   for (BasicBlock *BB : TheLoop->blocks()) {
6162     // For each instruction in the old loop.
6163     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6165       if (!Ptr)
6166         continue;
6167 
6168       // TODO: We should generate better code and update the cost model for
6169       // predicated uniform stores. Today they are treated as any other
6170       // predicated store (see added test cases in
6171       // invariant-store-vectorization.ll).
6172       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6173         NumPredStores++;
6174 
6175       if (Legal->isUniform(Ptr) &&
6176           // Conditional loads and stores should be scalarized and predicated.
6177           // isScalarWithPredication cannot be used here since masked
6178           // gather/scatters are not considered scalar with predication.
6179           !Legal->blockNeedsPredication(I.getParent())) {
6180         // TODO: Avoid replicating loads and stores instead of
6181         // relying on instcombine to remove them.
6182         // Load: Scalar load + broadcast
6183         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6184         unsigned Cost = getUniformMemOpCost(&I, VF);
6185         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6186         continue;
6187       }
6188 
6189       // We assume that widening is the best solution when possible.
6190       if (memoryInstructionCanBeWidened(&I, VF)) {
6191         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6192         int ConsecutiveStride =
6193                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6194         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6195                "Expected consecutive stride.");
6196         InstWidening Decision =
6197             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6198         setWideningDecision(&I, VF, Decision, Cost);
6199         continue;
6200       }
6201 
6202       // Choose between Interleaving, Gather/Scatter or Scalarization.
6203       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6204       unsigned NumAccesses = 1;
6205       if (isAccessInterleaved(&I)) {
6206         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6208 
6209         // Make one decision for the whole group.
6210         if (getWideningDecision(&I, VF) != CM_Unknown)
6211           continue;
6212 
6213         NumAccesses = Group->getNumMembers();
6214         if (interleavedAccessCanBeWidened(&I, VF))
6215           InterleaveCost = getInterleaveGroupCost(&I, VF);
6216       }
6217 
6218       unsigned GatherScatterCost =
6219           isLegalGatherOrScatter(&I)
6220               ? getGatherScatterCost(&I, VF) * NumAccesses
6221               : std::numeric_limits<unsigned>::max();
6222 
6223       unsigned ScalarizationCost =
6224           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6225 
      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
6228       unsigned Cost;
6229       InstWidening Decision;
6230       if (InterleaveCost <= GatherScatterCost &&
6231           InterleaveCost < ScalarizationCost) {
6232         Decision = CM_Interleave;
6233         Cost = InterleaveCost;
6234       } else if (GatherScatterCost < ScalarizationCost) {
6235         Decision = CM_GatherScatter;
6236         Cost = GatherScatterCost;
6237       } else {
6238         Decision = CM_Scalarize;
6239         Cost = ScalarizationCost;
6240       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group also receives the cost,
      // but the cost will actually be assigned to a single instruction.
6244       if (auto Group = getInterleavedAccessGroup(&I))
6245         setWideningDecision(Group, VF, Decision, Cost);
6246       else
6247         setWideningDecision(&I, VF, Decision, Cost);
6248     }
6249   }
6250 
6251   // Make sure that any load of address and any other address computation
6252   // remains scalar unless there is gather/scatter support. This avoids
6253   // inevitable extracts into address registers, and also has the benefit of
6254   // activating LSR more, since that pass can't optimize vectorized
6255   // addresses.
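  // For example, a load whose result only feeds a GEP that is used as another
  // load's pointer operand is forced back to CM_Scalarize below, so the
  // address stays in a scalar register instead of requiring an extract.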
6256   if (TTI.prefersVectorizedAddressing())
6257     return;
6258 
6259   // Start with all scalar pointer uses.
6260   SmallPtrSet<Instruction *, 8> AddrDefs;
6261   for (BasicBlock *BB : TheLoop->blocks())
6262     for (Instruction &I : *BB) {
6263       Instruction *PtrDef =
6264         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6265       if (PtrDef && TheLoop->contains(PtrDef) &&
6266           getWideningDecision(&I, VF) != CM_GatherScatter)
6267         AddrDefs.insert(PtrDef);
6268     }
6269 
6270   // Add all instructions used to generate the addresses.
6271   SmallVector<Instruction *, 4> Worklist;
6272   for (auto *I : AddrDefs)
6273     Worklist.push_back(I);
6274   while (!Worklist.empty()) {
6275     Instruction *I = Worklist.pop_back_val();
6276     for (auto &Op : I->operands())
6277       if (auto *InstOp = dyn_cast<Instruction>(Op))
6278         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6279             AddrDefs.insert(InstOp).second)
6280           Worklist.push_back(InstOp);
6281   }
6282 
6283   for (auto *I : AddrDefs) {
6284     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here once we know that this is the case.
6289       InstWidening Decision = getWideningDecision(I, VF);
6290       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6291         // Scalarize a widened load of address.
6292         setWideningDecision(I, VF, CM_Scalarize,
6293                             (VF * getMemoryInstructionCost(I, 1)));
6294       else if (auto Group = getInterleavedAccessGroup(I)) {
6295         // Scalarize an interleave group of address loads.
6296         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6297           if (Instruction *Member = Group->getMember(I))
6298             setWideningDecision(Member, VF, CM_Scalarize,
6299                                 (VF * getMemoryInstructionCost(Member, 1)));
6300         }
6301       }
6302     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6305       ForcedScalars[VF].insert(I);
6306   }
6307 }
6308 
6309 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6310                                                         unsigned VF,
6311                                                         Type *&VectorTy) {
6312   Type *RetTy = I->getType();
6313   if (canTruncateToMinimalBitwidth(I, VF))
6314     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6315   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6316   auto SE = PSE.getSE();
6317   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6318 
6319   // TODO: We need to estimate the cost of intrinsic calls.
6320   switch (I->getOpcode()) {
6321   case Instruction::GetElementPtr:
6322     // We mark this instruction as zero-cost because the cost of GEPs in
6323     // vectorized code depends on whether the corresponding memory instruction
6324     // is scalarized or not. Therefore, we handle GEPs with the memory
6325     // instruction cost.
6326     return 0;
6327   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6331     bool ScalarPredicatedBB = false;
6332     BranchInst *BI = cast<BranchInst>(I);
6333     if (VF > 1 && BI->isConditional() &&
6334         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6335          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6336       ScalarPredicatedBB = true;
6337 
6338     if (ScalarPredicatedBB) {
6339       // Return cost for branches around scalarized and predicated blocks.
6340       auto *Vec_i1Ty =
6341           FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6342       return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
6343                                            false, true) +
6344               (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
6345     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6346       // The back-edge branch will remain, as will all scalar branches.
6347       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6348     else
6349       // This branch will be eliminated by if-conversion.
6350       return 0;
6351     // Note: We currently assume zero cost for an unconditional branch inside
6352     // a predicated block since it will become a fall-through, although we
6353     // may decide in the future to call TTI for all branches.
6354   }
6355   case Instruction::PHI: {
6356     auto *Phi = cast<PHINode>(I);
6357 
6358     // First-order recurrences are replaced by vector shuffles inside the loop.
6359     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6360     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6361       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6362                                 cast<VectorType>(VectorTy), VF - 1,
6363                                 FixedVectorType::get(RetTy, 1));
6364 
6365     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6366     // converted into select instructions. We require N - 1 selects per phi
6367     // node, where N is the number of incoming values.
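    // For example, a phi with three incoming values is costed as two vector
    // selects.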
6368     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6369       return (Phi->getNumIncomingValues() - 1) *
6370              TTI.getCmpSelInstrCost(
6371                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6372                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6373                  CostKind);
6374 
6375     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6376   }
6377   case Instruction::UDiv:
6378   case Instruction::SDiv:
6379   case Instruction::URem:
6380   case Instruction::SRem:
6381     // If we have a predicated instruction, it may not be executed for each
6382     // vector lane. Get the scalarization cost and scale this amount by the
6383     // probability of executing the predicated block. If the instruction is not
6384     // predicated, we fall through to the next case.
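    // A rough sketch of the formula below for, e.g., a predicated sdiv at
    // VF = 4: (4 * phi cost + 4 * scalar sdiv cost + insert/extract overhead)
    // divided by the reciprocal block probability (assumed 50%, i.e. 2),
    // since each predicated block only executes for lanes whose mask is true.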
6385     if (VF > 1 && isScalarWithPredication(I)) {
6386       unsigned Cost = 0;
6387 
6388       // These instructions have a non-void type, so account for the phi nodes
6389       // that we will create. This cost is likely to be zero. The phi node
6390       // cost, if any, should be scaled by the block probability because it
6391       // models a copy at the end of each predicated block.
6392       Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6393 
6394       // The cost of the non-predicated instruction.
6395       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6396 
6397       // The cost of insertelement and extractelement instructions needed for
6398       // scalarization.
6399       Cost += getScalarizationOverhead(I, VF);
6400 
6401       // Scale the cost by the probability of executing the predicated blocks.
6402       // This assumes the predicated block for each vector lane is equally
6403       // likely.
6404       return Cost / getReciprocalPredBlockProb();
6405     }
6406     LLVM_FALLTHROUGH;
6407   case Instruction::Add:
6408   case Instruction::FAdd:
6409   case Instruction::Sub:
6410   case Instruction::FSub:
6411   case Instruction::Mul:
6412   case Instruction::FMul:
6413   case Instruction::FDiv:
6414   case Instruction::FRem:
6415   case Instruction::Shl:
6416   case Instruction::LShr:
6417   case Instruction::AShr:
6418   case Instruction::And:
6419   case Instruction::Or:
6420   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
6422     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6423       return 0;
6424     // Certain instructions can be cheaper to vectorize if they have a constant
6425     // second vector operand. One example of this are shifts on x86.
6426     Value *Op2 = I->getOperand(1);
6427     TargetTransformInfo::OperandValueProperties Op2VP;
6428     TargetTransformInfo::OperandValueKind Op2VK =
6429         TTI.getOperandInfo(Op2, Op2VP);
6430     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6431       Op2VK = TargetTransformInfo::OK_UniformValue;
6432 
6433     SmallVector<const Value *, 4> Operands(I->operand_values());
6434     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6435     return N * TTI.getArithmeticInstrCost(
6436                    I->getOpcode(), VectorTy, CostKind,
6437                    TargetTransformInfo::OK_AnyValue,
6438                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6439   }
6440   case Instruction::FNeg: {
6441     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6442     return N * TTI.getArithmeticInstrCost(
6443                    I->getOpcode(), VectorTy, CostKind,
6444                    TargetTransformInfo::OK_AnyValue,
6445                    TargetTransformInfo::OK_AnyValue,
6446                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6447                    I->getOperand(0), I);
6448   }
6449   case Instruction::Select: {
6450     SelectInst *SI = cast<SelectInst>(I);
6451     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6452     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6453     Type *CondTy = SI->getCondition()->getType();
6454     if (!ScalarCond)
6455       CondTy = FixedVectorType::get(CondTy, VF);
6456 
6457     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6458                                   CostKind, I);
6459   }
6460   case Instruction::ICmp:
6461   case Instruction::FCmp: {
6462     Type *ValTy = I->getOperand(0)->getType();
6463     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6464     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6465       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6466     VectorTy = ToVectorTy(ValTy, VF);
6467     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6468                                   I);
6469   }
6470   case Instruction::Store:
6471   case Instruction::Load: {
6472     unsigned Width = VF;
6473     if (Width > 1) {
6474       InstWidening Decision = getWideningDecision(I, Width);
6475       assert(Decision != CM_Unknown &&
6476              "CM decision should be taken at this point");
6477       if (Decision == CM_Scalarize)
6478         Width = 1;
6479     }
6480     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6481     return getMemoryInstructionCost(I, VF);
6482   }
6483   case Instruction::ZExt:
6484   case Instruction::SExt:
6485   case Instruction::FPToUI:
6486   case Instruction::FPToSI:
6487   case Instruction::FPExt:
6488   case Instruction::PtrToInt:
6489   case Instruction::IntToPtr:
6490   case Instruction::SIToFP:
6491   case Instruction::UIToFP:
6492   case Instruction::Trunc:
6493   case Instruction::FPTrunc:
6494   case Instruction::BitCast: {
6495     // Computes the CastContextHint from a Load/Store instruction.
6496     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6497       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6498              "Expected a load or a store!");
6499 
6500       if (VF == 1 || !TheLoop->contains(I))
6501         return TTI::CastContextHint::Normal;
6502 
6503       switch (getWideningDecision(I, VF)) {
6504       case LoopVectorizationCostModel::CM_GatherScatter:
6505         return TTI::CastContextHint::GatherScatter;
6506       case LoopVectorizationCostModel::CM_Interleave:
6507         return TTI::CastContextHint::Interleave;
6508       case LoopVectorizationCostModel::CM_Scalarize:
6509       case LoopVectorizationCostModel::CM_Widen:
6510         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6511                                         : TTI::CastContextHint::Normal;
6512       case LoopVectorizationCostModel::CM_Widen_Reverse:
6513         return TTI::CastContextHint::Reversed;
6514       case LoopVectorizationCostModel::CM_Unknown:
6515         llvm_unreachable("Instr did not go through cost modelling?");
6516       }
6517 
6518       llvm_unreachable("Unhandled case!");
6519     };
6520 
6521     unsigned Opcode = I->getOpcode();
6522     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6523     // For Trunc, the context is the only user, which must be a StoreInst.
6524     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6525       if (I->hasOneUse())
6526         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6527           CCH = ComputeCCH(Store);
6528     }
6529     // For Z/Sext, the context is the operand, which must be a LoadInst.
6530     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6531              Opcode == Instruction::FPExt) {
6532       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6533         CCH = ComputeCCH(Load);
6534     }
6535 
6536     // We optimize the truncation of induction variables having constant
6537     // integer steps. The cost of these truncations is the same as the scalar
6538     // operation.
6539     if (isOptimizableIVTruncate(I, VF)) {
6540       auto *Trunc = cast<TruncInst>(I);
6541       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6542                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6543     }
6544 
6545     Type *SrcScalarTy = I->getOperand(0)->getType();
6546     Type *SrcVecTy =
6547         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6548     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
6551       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6552       //
6553       // Calculate the modified src and dest types.
6554       Type *MinVecTy = VectorTy;
6555       if (Opcode == Instruction::Trunc) {
6556         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6557         VectorTy =
6558             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6559       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6560         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6561         VectorTy =
6562             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6563       }
6564     }
6565 
6566     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6567     return N *
6568            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6569   }
6570   case Instruction::Call: {
6571     bool NeedToScalarize;
6572     CallInst *CI = cast<CallInst>(I);
6573     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6574     if (getVectorIntrinsicIDForCall(CI, TLI))
6575       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6576     return CallCost;
6577   }
6578   default:
6579     // The cost of executing VF copies of the scalar instruction. This opcode
6580     // is unknown. Assume that it is the same as 'mul'.
6581     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6582                                            CostKind) +
6583            getScalarizationOverhead(I, VF);
6584   } // end of switch.
6585 }
6586 
6587 char LoopVectorize::ID = 0;
6588 
6589 static const char lv_name[] = "Loop Vectorization";
6590 
6591 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6592 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6593 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6594 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6595 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6596 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6597 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6598 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6599 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6600 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6601 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6602 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6603 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6604 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6605 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6606 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6607 
6608 namespace llvm {
6609 
6610 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6611 
6612 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6613                               bool VectorizeOnlyWhenForced) {
6614   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6615 }
6616 
6617 } // end namespace llvm
6618 
6619 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6620   // Check if the pointer operand of a load or store instruction is
6621   // consecutive.
6622   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6623     return Legal->isConsecutivePtr(Ptr);
6624   return false;
6625 }
6626 
6627 void LoopVectorizationCostModel::collectValuesToIgnore() {
6628   // Ignore ephemeral values.
6629   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6630 
6631   // Ignore type-promoting instructions we identified during reduction
6632   // detection.
6633   for (auto &Reduction : Legal->getReductionVars()) {
6634     RecurrenceDescriptor &RedDes = Reduction.second;
6635     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6636     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6637   }
6638   // Ignore type-casting instructions we identified during induction
6639   // detection.
6640   for (auto &Induction : Legal->getInductionVars()) {
6641     InductionDescriptor &IndDes = Induction.second;
6642     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6643     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6644   }
6645 }
6646 
6647 // TODO: we could return a pair of values that specify the max VF and
6648 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it yet because VPlan does not have a
// cost model that can choose which plan to execute if more than one is
// generated.
6652 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6653                                  LoopVectorizationCostModel &CM) {
6654   unsigned WidestType;
6655   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
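  // For example, 256-bit wide vector registers and a widest in-loop type of
  // i32 yield a VPlan VF of 256 / 32 = 8.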
6656   return WidestVectorRegBits / WidestType;
6657 }
6658 
6659 VectorizationFactor
6660 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6661   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
6664   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6665   // the vectorization pipeline.
6666   if (!OrigLoop->empty()) {
6667     // If the user doesn't provide a vectorization factor, determine a
6668     // reasonable one.
6669     if (!UserVF) {
6670       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6671       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6672 
6673       // Make sure we have a VF > 1 for stress testing.
6674       if (VPlanBuildStressTest && VF < 2) {
6675         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6676                           << "overriding computed VF.\n");
6677         VF = 4;
6678       }
6679     }
6680     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6681     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6682     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6683                       << " to build VPlans.\n");
6684     buildVPlans(VF, VF);
6685 
6686     // For VPlan build stress testing, we bail out after VPlan construction.
6687     if (VPlanBuildStressTest)
6688       return VectorizationFactor::Disabled();
6689 
6690     return {VF, 0};
6691   }
6692 
6693   LLVM_DEBUG(
6694       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6695                 "VPlan-native path.\n");
6696   return VectorizationFactor::Disabled();
6697 }
6698 
6699 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
6700                                                              unsigned UserIC) {
6701   assert(OrigLoop->empty() && "Inner loop expected.");
6702   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6704     return None;
6705 
6706   // Invalidate interleave groups if all blocks of loop will be predicated.
6707   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6708       !useMaskedInterleavedAccesses(*TTI)) {
6709     LLVM_DEBUG(
6710         dbgs()
6711         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6712            "which requires masked-interleaved support.\n");
6713     if (CM.InterleaveInfo.invalidateGroups())
6714       // Invalidating interleave groups also requires invalidating all decisions
6715       // based on them, which includes widening decisions and uniform and scalar
6716       // values.
6717       CM.invalidateCostModelingDecisions();
6718   }
6719 
6720   if (UserVF) {
6721     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6722     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6723     // Collect the instructions (and their associated costs) that will be more
6724     // profitable to scalarize.
6725     CM.selectUserVectorizationFactor(UserVF);
6726     buildVPlansWithVPRecipes(UserVF, UserVF);
6727     LLVM_DEBUG(printPlans(dbgs()));
6728     return {{UserVF, 0}};
6729   }
6730 
6731   unsigned MaxVF = MaybeMaxVF.getValue();
6732   assert(MaxVF != 0 && "MaxVF is zero.");
6733 
6734   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6735     // Collect Uniform and Scalar instructions after vectorization with VF.
6736     CM.collectUniformsAndScalars(VF);
6737 
6738     // Collect the instructions (and their associated costs) that will be more
6739     // profitable to scalarize.
6740     if (VF > 1)
6741       CM.collectInstsToScalarize(VF);
6742   }
6743 
6744   buildVPlansWithVPRecipes(1, MaxVF);
6745   LLVM_DEBUG(printPlans(dbgs()));
6746   if (MaxVF == 1)
6747     return VectorizationFactor::Disabled();
6748 
6749   // Select the optimal vectorization factor.
6750   return CM.selectVectorizationFactor(MaxVF);
6751 }
6752 
6753 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6754   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6755                     << '\n');
6756   BestVF = VF;
6757   BestUF = UF;
6758 
6759   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6760     return !Plan->hasVF(VF);
6761   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6763 }
6764 
6765 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6766                                            DominatorTree *DT) {
6767   // Perform the actual loop transformation.
6768 
6769   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6770   VPCallbackILV CallbackILV(ILV);
6771 
6772   VPTransformState State{BestVF, BestUF,      LI,
6773                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6774                          &ILV,   CallbackILV};
6775   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6776   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6777   State.CanonicalIV = ILV.Induction;
6778 
6779   //===------------------------------------------------===//
6780   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
6783   // the cost-model.
6784   //
6785   //===------------------------------------------------===//
6786 
6787   // 2. Copy and widen instructions from the old loop into the new loop.
6788   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6789   VPlans.front()->execute(&State);
6790 
6791   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6792   //    predication, updating analyses.
6793   ILV.fixVectorizedLoop();
6794 }
6795 
6796 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6797     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6798   BasicBlock *Latch = OrigLoop->getLoopLatch();
6799 
6800   // We create new control-flow for the vectorized loop, so the original
6801   // condition will be dead after vectorization if it's only used by the
6802   // branch.
6803   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6804   if (Cmp && Cmp->hasOneUse())
6805     DeadInstructions.insert(Cmp);
6806 
6807   // We create new "steps" for induction variable updates to which the original
6808   // induction variables map. An original update instruction will be dead if
6809   // all its users except the induction variable are dead.
6810   for (auto &Induction : Legal->getInductionVars()) {
6811     PHINode *Ind = Induction.first;
6812     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6813     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6814           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
6815         }))
6816       DeadInstructions.insert(IndUpdate);
6817 
    // We also record as "Dead" the type-casting instructions we had identified
6819     // during induction analysis. We don't need any handling for them in the
6820     // vectorized loop because we have proven that, under a proper runtime
6821     // test guarding the vectorized loop, the value of the phi, and the casted
6822     // value of the phi, are the same. The last instruction in this casting chain
6823     // will get its scalar/vector/widened def from the scalar/vector/widened def
6824     // of the respective phi node. Any other casts in the induction def-use chain
6825     // have no other uses outside the phi update chain, and will be ignored.
6826     InductionDescriptor &IndDes = Induction.second;
6827     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6828     DeadInstructions.insert(Casts.begin(), Casts.end());
6829   }
6830 }
6831 
6832 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6833 
6834 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6835 
6836 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6837                                         Instruction::BinaryOps BinOp) {
6838   // When unrolling and the VF is 1, we only need to add a simple scalar.
6839   Type *Ty = Val->getType();
6840   assert(!Ty->isVectorTy() && "Val must be a scalar");
6841 
6842   if (Ty->isFloatingPointTy()) {
6843     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6844 
6845     // Floating point operations had to be 'fast' to enable the unrolling.
6846     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6847     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6848   }
6849   Constant *C = ConstantInt::get(Ty, StartIdx);
6850   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6851 }
6852 
6853 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6854   SmallVector<Metadata *, 4> MDs;
6855   // Reserve first location for self reference to the LoopID metadata node.
6856   MDs.push_back(nullptr);
6857   bool IsUnrollMetadata = false;
6858   MDNode *LoopID = L->getLoopID();
6859   if (LoopID) {
6860     // First find existing loop unrolling disable metadata.
6861     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6862       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6863       if (MD) {
6864         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6865         IsUnrollMetadata =
6866             S && S->getString().startswith("llvm.loop.unroll.disable");
6867       }
6868       MDs.push_back(LoopID->getOperand(i));
6869     }
6870   }
6871 
6872   if (!IsUnrollMetadata) {
6873     // Add runtime unroll disable metadata.
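    // The resulting loop metadata looks roughly like:
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}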
6874     LLVMContext &Context = L->getHeader()->getContext();
6875     SmallVector<Metadata *, 1> DisableOperands;
6876     DisableOperands.push_back(
6877         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6878     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6879     MDs.push_back(DisableNode);
6880     MDNode *NewLoopID = MDNode::get(Context, MDs);
6881     // Set operand 0 to refer to the loop id itself.
6882     NewLoopID->replaceOperandWith(0, NewLoopID);
6883     L->setLoopID(NewLoopID);
6884   }
6885 }
6886 
6887 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6888     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6889   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6890   bool PredicateAtRangeStart = Predicate(Range.Start);
6891 
6892   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6893     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6894       Range.End = TmpVF;
6895       break;
6896     }
6897 
6898   return PredicateAtRangeStart;
6899 }
6900 
6901 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6902 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6903 /// of VF's starting at a given VF and extending it as much as possible. Each
6904 /// vectorization decision can potentially shorten this sub-range during
6905 /// buildVPlan().
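/// For example, if the widening decisions for VF = {1, 2} differ from those
/// for VF = {4, 8}, two VPlans are built: one covering {1, 2} and another
/// covering {4, 8}.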
6906 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6907   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6908     VFRange SubRange = {VF, MaxVF + 1};
6909     VPlans.push_back(buildVPlan(SubRange));
6910     VF = SubRange.End;
6911   }
6912 }
6913 
6914 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6915                                          VPlanPtr &Plan) {
6916   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6917 
6918   // Look for cached value.
6919   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6920   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6921   if (ECEntryIt != EdgeMaskCache.end())
6922     return ECEntryIt->second;
6923 
6924   VPValue *SrcMask = createBlockInMask(Src, Plan);
6925 
6926   // The terminator has to be a branch inst!
6927   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6928   assert(BI && "Unexpected terminator found");
6929 
6930   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6931     return EdgeMaskCache[Edge] = SrcMask;
6932 
6933   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6934   assert(EdgeMask && "No Edge Mask found for condition");
6935 
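  // The branch condition guards the edge to successor 0; the edge to
  // successor 1 is taken under its negation.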
6936   if (BI->getSuccessor(0) != Dst)
6937     EdgeMask = Builder.createNot(EdgeMask);
6938 
6939   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6940     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6941 
6942   return EdgeMaskCache[Edge] = EdgeMask;
6943 }
6944 
6945 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6946   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6947 
6948   // Look for cached value.
6949   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6950   if (BCEntryIt != BlockMaskCache.end())
6951     return BCEntryIt->second;
6952 
6953   // All-one mask is modelled as no-mask following the convention for masked
6954   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6955   VPValue *BlockMask = nullptr;
6956 
6957   if (OrigLoop->getHeader() == BB) {
6958     if (!CM.blockNeedsPredication(BB))
6959       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6960 
6961     // Introduce the early-exit compare IV <= BTC to form header block mask.
6962     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6963     // Start by constructing the desired canonical IV.
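    // For example, with a trip count of 10 and VF = 4, the last vector
    // iteration compares IV lanes {8, 9, 10, 11} against BTC = 9, masking off
    // the two lanes that would run past the scalar trip count.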
6964     VPValue *IV = nullptr;
6965     if (Legal->getPrimaryInduction())
6966       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6967     else {
6968       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6969       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6970       IV = IVRecipe->getVPValue();
6971     }
6972     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6973     bool TailFolded = !CM.isScalarEpilogueAllowed();
6974     if (TailFolded && CM.TTI.emitGetActiveLaneMask())
6975       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
6976     else
6977       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6978     return BlockMaskCache[BB] = BlockMask;
6979   }
6980 
6981   // This is the block mask. We OR all incoming edges.
6982   for (auto *Predecessor : predecessors(BB)) {
6983     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6984     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6985       return BlockMaskCache[BB] = EdgeMask;
6986 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
6988       BlockMask = EdgeMask;
6989       continue;
6990     }
6991 
6992     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6993   }
6994 
6995   return BlockMaskCache[BB] = BlockMask;
6996 }
6997 
6998 VPWidenMemoryInstructionRecipe *
6999 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7000                                   VPlanPtr &Plan) {
7001   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7002          "Must be called with either a load or store");
7003 
7004   auto willWiden = [&](unsigned VF) -> bool {
7005     if (VF == 1)
7006       return false;
7007     LoopVectorizationCostModel::InstWidening Decision =
7008         CM.getWideningDecision(I, VF);
7009     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7010            "CM decision should be taken at this point.");
7011     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7012       return true;
7013     if (CM.isScalarAfterVectorization(I, VF) ||
7014         CM.isProfitableToScalarize(I, VF))
7015       return false;
7016     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7017   };
7018 
7019   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7020     return nullptr;
7021 
7022   VPValue *Mask = nullptr;
7023   if (Legal->isMaskRequired(I))
7024     Mask = createBlockInMask(I->getParent(), Plan);
7025 
7026   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7027   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7028     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7029 
7030   StoreInst *Store = cast<StoreInst>(I);
7031   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7032   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7033 }
7034 
7035 VPWidenIntOrFpInductionRecipe *
7036 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7037   // Check if this is an integer or fp induction. If so, build the recipe that
7038   // produces its scalar and vector values.
7039   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7040   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7041       II.getKind() == InductionDescriptor::IK_FpInduction)
7042     return new VPWidenIntOrFpInductionRecipe(Phi);
7043 
7044   return nullptr;
7045 }
7046 
7047 VPWidenIntOrFpInductionRecipe *
7048 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7049                                                 VFRange &Range) const {
7050   // Optimize the special case where the source is a constant integer
7051   // induction variable. Notice that we can only optimize the 'trunc' case
7052   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7053   // (c) other casts depend on pointer size.
7054 
7055   // Determine whether \p K is a truncation based on an induction variable that
7056   // can be optimized.
7057   auto isOptimizableIVTruncate =
7058       [&](Instruction *K) -> std::function<bool(unsigned)> {
7059     return
7060         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
7061   };
7062 
7063   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7064           isOptimizableIVTruncate(I), Range))
7065     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7066                                              I);
7067   return nullptr;
7068 }
7069 
7070 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7071   // We know that all PHIs in non-header blocks are converted into selects, so
7072   // we don't have to worry about the insertion order and we can just use the
7073   // builder. At this point we generate the predication tree. There may be
7074   // duplications since this is a simple recursive scan, but future
7075   // optimizations will clean it up.
7076 
7077   SmallVector<VPValue *, 2> Operands;
7078   unsigned NumIncoming = Phi->getNumIncomingValues();
7079   for (unsigned In = 0; In < NumIncoming; In++) {
7080     VPValue *EdgeMask =
7081       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7082     assert((EdgeMask || NumIncoming == 1) &&
7083            "Multiple predecessors with one having a full mask");
7084     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7085     if (EdgeMask)
7086       Operands.push_back(EdgeMask);
7087   }
7088   return new VPBlendRecipe(Phi, Operands);
7089 }
7090 
7091 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7092                                                    VPlan &Plan) const {
7093 
7094   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7095       [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
7096       Range);
7097 
7098   if (IsPredicated)
7099     return nullptr;
7100 
7101   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7102   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7103              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7104     return nullptr;
7105 
7106   auto willWiden = [&](unsigned VF) -> bool {
7107     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
7112     bool NeedToScalarize = false;
7113     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7114     bool UseVectorIntrinsic =
7115         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7116     return UseVectorIntrinsic || !NeedToScalarize;
7117   };
7118 
7119   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7120     return nullptr;
7121 
7122   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7123 }
7124 
7125 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7126   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7127          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
7130   auto WillScalarize = [this, I](unsigned VF) -> bool {
7131     return CM.isScalarAfterVectorization(I, VF) ||
7132            CM.isProfitableToScalarize(I, VF) ||
7133            CM.isScalarWithPredication(I, VF);
7134   };
7135   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7136                                                              Range);
7137 }
7138 
7139 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7140   auto IsVectorizableOpcode = [](unsigned Opcode) {
7141     switch (Opcode) {
7142     case Instruction::Add:
7143     case Instruction::And:
7144     case Instruction::AShr:
7145     case Instruction::BitCast:
7146     case Instruction::FAdd:
7147     case Instruction::FCmp:
7148     case Instruction::FDiv:
7149     case Instruction::FMul:
7150     case Instruction::FNeg:
7151     case Instruction::FPExt:
7152     case Instruction::FPToSI:
7153     case Instruction::FPToUI:
7154     case Instruction::FPTrunc:
7155     case Instruction::FRem:
7156     case Instruction::FSub:
7157     case Instruction::ICmp:
7158     case Instruction::IntToPtr:
7159     case Instruction::LShr:
7160     case Instruction::Mul:
7161     case Instruction::Or:
7162     case Instruction::PtrToInt:
7163     case Instruction::SDiv:
7164     case Instruction::Select:
7165     case Instruction::SExt:
7166     case Instruction::Shl:
7167     case Instruction::SIToFP:
7168     case Instruction::SRem:
7169     case Instruction::Sub:
7170     case Instruction::Trunc:
7171     case Instruction::UDiv:
7172     case Instruction::UIToFP:
7173     case Instruction::URem:
7174     case Instruction::Xor:
7175     case Instruction::ZExt:
7176       return true;
7177     }
7178     return false;
7179   };
7180 
7181   if (!IsVectorizableOpcode(I->getOpcode()))
7182     return nullptr;
7183 
7184   // Success: widen this instruction.
7185   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7186 }
7187 
7188 VPBasicBlock *VPRecipeBuilder::handleReplication(
7189     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7190     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7191     VPlanPtr &Plan) {
7192   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7193       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7194       Range);
7195 
7196   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7197       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7198 
7199   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7200                                        IsUniform, IsPredicated);
7201   setRecipe(I, Recipe);
7202 
7203   // Find if I uses a predicated instruction. If so, it will use its scalar
7204   // value. Avoid hoisting the insert-element which packs the scalar value into
7205   // a vector value, as that happens iff all users use the vector value.
7206   for (auto &Op : I->operands())
7207     if (auto *PredInst = dyn_cast<Instruction>(Op))
7208       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7209         PredInst2Recipe[PredInst]->setAlsoPack(false);
7210 
  // Finalize the recipe for Instr; handle the non-predicated case first.
7212   if (!IsPredicated) {
7213     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7214     VPBB->appendRecipe(Recipe);
7215     return VPBB;
7216   }
7217   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7218   assert(VPBB->getSuccessors().empty() &&
7219          "VPBB has successors when handling predicated replication.");
7220   // Record predicated instructions for above packing optimizations.
7221   PredInst2Recipe[I] = Recipe;
7222   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7223   VPBlockUtils::insertBlockAfter(Region, VPBB);
7224   auto *RegSucc = new VPBasicBlock();
7225   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7226   return RegSucc;
7227 }
7228 
7229 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7230                                                       VPRecipeBase *PredRecipe,
7231                                                       VPlanPtr &Plan) {
7232   // Instructions marked for predication are replicated and placed under an
7233   // if-then construct to prevent side-effects.
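  // For example, a predicated store ends up in a triangle:
  //   pred.store.entry -> pred.store.if -> pred.store.continue
  // with an extra edge from .entry to .continue that skips the store when the
  // current lane's mask bit is false.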
7234 
7235   // Generate recipes to compute the block mask for this region.
7236   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7237 
7238   // Build the triangular if-then region.
7239   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7240   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7241   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7242   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7243   auto *PHIRecipe =
7244       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7245   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7246   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7247   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7248 
7249   // Note: first set Entry as region entry and then connect successors starting
7250   // from it in order, to propagate the "parent" of each VPBasicBlock.
7251   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7252   VPBlockUtils::connectBlocks(Pred, Exit);
7253 
7254   return Region;
7255 }
7256 
7257 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7258                                                       VFRange &Range,
7259                                                       VPlanPtr &Plan) {
7260   // First, check for specific widening recipes that deal with calls, memory
7261   // operations, inductions and Phi nodes.
7262   if (auto *CI = dyn_cast<CallInst>(Instr))
7263     return tryToWidenCall(CI, Range, *Plan);
7264 
7265   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7266     return tryToWidenMemory(Instr, Range, Plan);
7267 
7268   VPRecipeBase *Recipe;
7269   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7270     if (Phi->getParent() != OrigLoop->getHeader())
7271       return tryToBlend(Phi, Plan);
7272     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7273       return Recipe;
7274     return new VPWidenPHIRecipe(Phi);
7275   }
7276 
7277   if (isa<TruncInst>(Instr) &&
7278       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7279     return Recipe;
7280 
7281   if (!shouldWiden(Instr, Range))
7282     return nullptr;
7283 
7284   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7285     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7286                                 OrigLoop);
7287 
7288   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7289     bool InvariantCond =
7290         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7291     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7292                                    InvariantCond);
7293   }
7294 
7295   return tryToWiden(Instr, *Plan);
7296 }
7297 
7298 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7299                                                         unsigned MaxVF) {
7300   assert(OrigLoop->empty() && "Inner loop expected.");
7301 
7302   // Collect conditions feeding internal conditional branches; they need to be
7303   // represented in VPlan for it to model masking.
7304   SmallPtrSet<Value *, 1> NeedDef;
7305 
7306   auto *Latch = OrigLoop->getLoopLatch();
7307   for (BasicBlock *BB : OrigLoop->blocks()) {
7308     if (BB == Latch)
7309       continue;
7310     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7311     if (Branch && Branch->isConditional())
7312       NeedDef.insert(Branch->getCondition());
7313   }
7314 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking.
7317   // Also, both the Phi and the live-out instruction of each reduction are
7318   // required in order to introduce a select between them in VPlan.
7319   if (CM.foldTailByMasking()) {
7320     if (Legal->getPrimaryInduction())
7321       NeedDef.insert(Legal->getPrimaryInduction());
7322     for (auto &Reduction : Legal->getReductionVars()) {
7323       NeedDef.insert(Reduction.first);
7324       NeedDef.insert(Reduction.second.getLoopExitInstr());
7325     }
7326   }
7327 
7328   // Collect instructions from the original loop that will become trivially dead
7329   // in the vectorized loop. We don't need to vectorize these instructions. For
7330   // example, original induction update instructions can become dead because we
7331   // separately emit induction "steps" when generating code for the new loop.
7332   // Similarly, we create a new latch condition when setting up the structure
7333   // of the new loop, so the old one can become dead.
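  // An illustrative example (names made up):
  //   %iv.next = add nuw nsw i64 %iv, 1
  //   %exitcond = icmp eq i64 %iv.next, %n
  // both of these typically become dead, as the vector loop emits its own
  // widened induction steps and a fresh latch compare.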
7334   SmallPtrSet<Instruction *, 4> DeadInstructions;
7335   collectTriviallyDeadInstructions(DeadInstructions);
7336 
7337   // Add assume instructions we need to drop to DeadInstructions, to prevent
7338   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7340   // control flow is preserved, we should keep them.
7341   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7342   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7343 
7344   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7345   // Dead instructions do not need sinking. Remove them from SinkAfter.
7346   for (Instruction *I : DeadInstructions)
7347     SinkAfter.erase(I);
7348 
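  // Build a VPlan for each contiguous sub-range of VFs. Note that
  // buildVPlanWithVPRecipes may clamp SubRange.End so that a single VPlan only
  // covers VFs sharing the same widening decisions; the next iteration then
  // resumes from the clamped End.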
7349   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7350     VFRange SubRange = {VF, MaxVF + 1};
7351     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7352                                              DeadInstructions, SinkAfter));
7353     VF = SubRange.End;
7354   }
7355 }
7356 
7357 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7358     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7359     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7360     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7361 
7362   // Hold a mapping from predicated instructions to their recipes, in order to
7363   // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7365   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7366 
7367   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7368 
7369   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7370 
7371   // ---------------------------------------------------------------------------
7372   // Pre-construction: record ingredients whose recipes we'll need to further
7373   // process after constructing the initial VPlan.
7374   // ---------------------------------------------------------------------------
7375 
7376   // Mark instructions we'll need to sink later and their targets as
7377   // ingredients whose recipe we'll need to record.
7378   for (auto &Entry : SinkAfter) {
7379     RecipeBuilder.recordRecipeOf(Entry.first);
7380     RecipeBuilder.recordRecipeOf(Entry.second);
7381   }
7382 
7383   // For each interleave group which is relevant for this (possibly trimmed)
7384   // Range, add it to the set of groups to be later applied to the VPlan and add
7385   // placeholders for its members' Recipes which we'll be replacing with a
7386   // single VPInterleaveRecipe.
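  // For example (illustrative), two strided loads such as
  //   %a = load i32, i32* %gep.0   ; A[2*i]
  //   %b = load i32, i32* %gep.1   ; A[2*i+1]
  // form a group with factor 2; the placeholder recipes of both members will
  // later be replaced by a single VPInterleaveRecipe at the insert position.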
7387   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7388     auto applyIG = [IG, this](unsigned VF) -> bool {
7389       return (VF >= 2 && // Query is illegal for VF == 1
7390               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7391                   LoopVectorizationCostModel::CM_Interleave);
7392     };
7393     if (!getDecisionAndClampRange(applyIG, Range))
7394       continue;
7395     InterleaveGroups.insert(IG);
7396     for (unsigned i = 0; i < IG->getFactor(); i++)
7397       if (Instruction *Member = IG->getMember(i))
7398         RecipeBuilder.recordRecipeOf(Member);
7399   };
7400 
7401   // ---------------------------------------------------------------------------
7402   // Build initial VPlan: Scan the body of the loop in a topological order to
7403   // visit each basic block after having visited its predecessor basic blocks.
7404   // ---------------------------------------------------------------------------
7405 
7406   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7407   auto Plan = std::make_unique<VPlan>();
7408   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7409   Plan->setEntry(VPBB);
7410 
7411   // Represent values that will have defs inside VPlan.
7412   for (Value *V : NeedDef)
7413     Plan->addVPValue(V);
7414 
7415   // Scan the body of the loop in a topological order to visit each basic block
7416   // after having visited its predecessor basic blocks.
7417   LoopBlocksDFS DFS(OrigLoop);
7418   DFS.perform(LI);
7419 
7420   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
7423     unsigned VPBBsForBB = 0;
7424     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7425     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7426     VPBB = FirstVPBBForBB;
7427     Builder.setInsertPoint(VPBB);
7428 
7429     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7431     for (Instruction &I : BB->instructionsWithoutDebug()) {
7432       Instruction *Instr = &I;
7433 
7434       // First filter out irrelevant instructions, to ensure no recipes are
7435       // built for them.
7436       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7437         continue;
7438 
7439       if (auto Recipe =
7440               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7441         RecipeBuilder.setRecipe(Instr, Recipe);
7442         VPBB->appendRecipe(Recipe);
7443         continue;
7444       }
7445 
      // Otherwise, if all widening options failed, the instruction is to be
7447       // replicated. This may create a successor for VPBB.
7448       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7449           Instr, Range, VPBB, PredInst2Recipe, Plan);
7450       if (NextVPBB != VPBB) {
7451         VPBB = NextVPBB;
7452         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7453                                     : "");
7454       }
7455     }
7456   }
7457 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7461   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7462   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7463   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7464   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7465   delete PreEntry;
7466 
7467   // ---------------------------------------------------------------------------
7468   // Transform initial VPlan: Apply previously taken decisions, in order, to
7469   // bring the VPlan to its final state.
7470   // ---------------------------------------------------------------------------
7471 
7472   // Apply Sink-After legal constraints.
7473   for (auto &Entry : SinkAfter) {
7474     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7475     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7476     Sink->moveAfter(Target);
7477   }
7478 
7479   // Interleave memory: for each Interleave Group we marked earlier as relevant
7480   // for this VPlan, replace the Recipes widening its memory instructions with a
7481   // single VPInterleaveRecipe at its insertion point.
7482   for (auto IG : InterleaveGroups) {
7483     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7484         RecipeBuilder.getRecipe(IG->getInsertPos()));
7485     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7486         ->insertBefore(Recipe);
7487 
7488     for (unsigned i = 0; i < IG->getFactor(); ++i)
7489       if (Instruction *Member = IG->getMember(i)) {
7490         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7491       }
7492   }
7493 
7494   // Finally, if tail is folded by masking, introduce selects between the phi
7495   // and the live-out instruction of each reduction, at the end of the latch.
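  // For example (illustrative IR), for a masked sum reduction the latch then
  // selects, per lane, between the updated value and the value carried over
  // from the previous iteration:
  //   %r = select <4 x i1> %mask, <4 x i32> %red.next, <4 x i32> %red.phi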
7496   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7497     Builder.setInsertPoint(VPBB);
7498     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7499     for (auto &Reduction : Legal->getReductionVars()) {
7500       VPValue *Phi = Plan->getVPValue(Reduction.first);
7501       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7502       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7503     }
7504   }
7505 
7506   std::string PlanName;
7507   raw_string_ostream RSO(PlanName);
7508   unsigned VF = Range.Start;
7509   Plan->addVF(VF);
7510   RSO << "Initial VPlan for VF={" << VF;
7511   for (VF *= 2; VF < Range.End; VF *= 2) {
7512     Plan->addVF(VF);
7513     RSO << "," << VF;
7514   }
7515   RSO << "},UF>=1";
7516   RSO.flush();
7517   Plan->setName(PlanName);
7518 
7519   return Plan;
7520 }
7521 
7522 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable.
7525   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7526   // the vectorization pipeline.
  assert(!OrigLoop->empty() && "Expected outer loop.");
7528   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7529 
7530   // Create new empty VPlan
7531   auto Plan = std::make_unique<VPlan>();
7532 
7533   // Build hierarchical CFG
7534   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7535   HCFGBuilder.buildHierarchicalCFG();
7536 
7537   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7538     Plan->addVF(VF);
7539 
7540   if (EnableVPlanPredication) {
7541     VPlanPredicator VPP(*Plan);
7542     VPP.predicate();
7543 
    // Avoid running the transformation to recipes until masked code generation
    // in the VPlan-native path is in place.
7546     return Plan;
7547   }
7548 
7549   SmallPtrSet<Instruction *, 1> DeadInstructions;
7550   VPlanTransforms::VPInstructionsToVPRecipes(
7551       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7552   return Plan;
7553 }
7554 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7559 
7560 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7561     Value *V, const VPIteration &Instance) {
7562   return ILV.getOrCreateScalarValue(V, Instance);
7563 }
7564 
7565 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7566                                VPSlotTracker &SlotTracker) const {
7567   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7568   IG->getInsertPos()->printAsOperand(O, false);
7569   O << ", ";
7570   getAddr()->printAsOperand(O, SlotTracker);
7571   VPValue *Mask = getMask();
7572   if (Mask) {
7573     O << ", ";
7574     Mask->printAsOperand(O, SlotTracker);
7575   }
7576   for (unsigned i = 0; i < IG->getFactor(); ++i)
7577     if (Instruction *I = IG->getMember(i))
7578       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7579 }
7580 
7581 void VPWidenCallRecipe::execute(VPTransformState &State) {
7582   State.ILV->widenCallInstruction(Ingredient, User, State);
7583 }
7584 
7585 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7586   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7587 }
7588 
7589 void VPWidenRecipe::execute(VPTransformState &State) {
7590   State.ILV->widenInstruction(Ingredient, User, State);
7591 }
7592 
7593 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7594   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7595                       IsIndexLoopInvariant, State);
7596 }
7597 
7598 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7599   assert(!State.Instance && "Int or FP induction being replicated.");
7600   State.ILV->widenIntOrFpInduction(IV, Trunc);
7601 }
7602 
7603 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7604   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7605 }
7606 
7607 void VPBlendRecipe::execute(VPTransformState &State) {
7608   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7609   // We know that all PHIs in non-header blocks are converted into
7610   // selects, so we don't have to worry about the insertion order and we
7611   // can just use the builder.
7612   // At this point we generate the predication tree. There may be
7613   // duplications since this is a simple recursive scan, but future
7614   // optimizations will clean it up.
7615 
7616   unsigned NumIncoming = getNumIncomingValues();
7617 
7618   // Generate a sequence of selects of the form:
7619   // SELECT(Mask3, In3,
7620   //        SELECT(Mask2, In2,
7621   //               SELECT(Mask1, In1,
7622   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi are
  // essentially undef and take their value from In0.
7625   InnerLoopVectorizer::VectorParts Entry(State.UF);
7626   for (unsigned In = 0; In < NumIncoming; ++In) {
7627     for (unsigned Part = 0; Part < State.UF; ++Part) {
7628       // We might have single edge PHIs (blocks) - use an identity
7629       // 'select' for the first PHI operand.
7630       Value *In0 = State.get(getIncomingValue(In), Part);
7631       if (In == 0)
7632         Entry[Part] = In0; // Initialize with the first incoming value.
7633       else {
7634         // Select between the current value and the previous incoming edge
7635         // based on the incoming mask.
7636         Value *Cond = State.get(getMask(In), Part);
7637         Entry[Part] =
7638             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7639       }
7640     }
7641   }
7642   for (unsigned Part = 0; Part < State.UF; ++Part)
7643     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7644 }
7645 
7646 void VPInterleaveRecipe::execute(VPTransformState &State) {
7647   assert(!State.Instance && "Interleave group being replicated.");
7648   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7649 }
7650 
7651 void VPReplicateRecipe::execute(VPTransformState &State) {
7652   if (State.Instance) { // Generate a single instance.
7653     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7654                                     IsPredicated, State);
7655     // Insert scalar instance packing it into a vector.
7656     if (AlsoPack && State.VF > 1) {
7657       // If we're constructing lane 0, initialize to start from undef.
7658       if (State.Instance->Lane == 0) {
7659         Value *Undef = UndefValue::get(
7660             FixedVectorType::get(Ingredient->getType(), State.VF));
7661         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7662       }
7663       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7664     }
7665     return;
7666   }
7667 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
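  // For example, with UF = 2 and VF = 4, a non-uniform instruction gets
  // 2 * 4 = 8 scalar copies, while a uniform one gets only 2 (lane 0 of each
  // part).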
7671   unsigned EndLane = IsUniform ? 1 : State.VF;
7672   for (unsigned Part = 0; Part < State.UF; ++Part)
7673     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7674       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7675                                       IsPredicated, State);
7676 }
7677 
7678 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7679   assert(State.Instance && "Branch on Mask works only on single instance.");
7680 
7681   unsigned Part = State.Instance->Part;
7682   unsigned Lane = State.Instance->Lane;
7683 
7684   Value *ConditionBit = nullptr;
7685   VPValue *BlockInMask = getMask();
7686   if (BlockInMask) {
7687     ConditionBit = State.get(BlockInMask, Part);
7688     if (ConditionBit->getType()->isVectorTy())
7689       ConditionBit = State.Builder.CreateExtractElement(
7690           ConditionBit, State.Builder.getInt32(Lane));
7691   } else // Block in mask is all-one.
7692     ConditionBit = State.Builder.getTrue();
7693 
7694   // Replace the temporary unreachable terminator with a new conditional branch,
7695   // whose two destinations will be set later when they are created.
7696   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7697   assert(isa<UnreachableInst>(CurrentTerminator) &&
7698          "Expected to replace unreachable terminator with conditional branch.");
7699   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7700   CondBr->setSuccessor(0, nullptr);
7701   ReplaceInstWithInst(CurrentTerminator, CondBr);
7702 }
7703 
7704 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7705   assert(State.Instance && "Predicated instruction PHI works per instance.");
7706   Instruction *ScalarPredInst = cast<Instruction>(
7707       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7708   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7709   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7710   assert(PredicatingBB && "Predicated block has no single predecessor.");
7711 
7712   // By current pack/unpack logic we need to generate only a single phi node: if
7713   // a vector value for the predicated instruction exists at this point it means
7714   // the instruction has vector users only, and a phi for the vector value is
7715   // needed. In this case the recipe of the predicated instruction is marked to
7716   // also do that packing, thereby "hoisting" the insert-element sequence.
7717   // Otherwise, a phi node for the scalar value is needed.
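  // Illustrative IR (names made up; blocks follow the pred.* naming scheme):
  //   vector case:
  //     %vphi = phi <4 x i32> [ %vec, %pred.entry ], [ %vec.ins, %pred.if ]
  //   scalar case:
  //     %sphi = phi i32 [ undef, %pred.entry ], [ %scl, %pred.if ]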
7718   unsigned Part = State.Instance->Part;
7719   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7720     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7721     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7722     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7723     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7724     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7725     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7726   } else {
7727     Type *PredInstType = PredInst->getType();
7728     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7729     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7730     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7731     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7732   }
7733 }
7734 
7735 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7736   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7737   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7738                                         getMask());
7739 }
7740 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyzes whether the loop is suitable
// for predication.
7745 static ScalarEpilogueLowering getScalarEpilogueLowering(
7746     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7747     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7748     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7749     LoopVectorizationLegality &LVL) {
7750   bool OptSize =
7751       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7752                                                      PGSOQueryType::IRPass);
7753   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7754   // don't look at hints or options, and don't request a scalar epilogue.
7755   if (OptSize)
7756     return CM_ScalarEpilogueNotAllowedOptSize;
7757 
7758   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7759                               !PreferPredicateOverEpilog;
7760 
  // 2) Next, if disabling predication is requested on the command line, honor
7762   // this and request a scalar epilogue.
7763   if (PredicateOptDisabled)
7764     return CM_ScalarEpilogueAllowed;
7765 
  // 3) and 4) check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates it is profitable; if
  // so, request predication.
7769   if (PreferPredicateOverEpilog ||
7770       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7771       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7772                                         LVL.getLAI()) &&
7773        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7774     return CM_ScalarEpilogueNotNeededUsePredicate;
7775 
7776   return CM_ScalarEpilogueAllowed;
7777 }
7778 
7779 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7781 // VPlan-to-VPlan transformations from the very beginning without modifying the
7782 // input LLVM IR.
7783 static bool processLoopInVPlanNativePath(
7784     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7785     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7786     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7787     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7788     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7789 
7790   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
7791     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7792     return false;
7793   }
7794   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7795   Function *F = L->getHeader()->getParent();
7796   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7797 
7798   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7799       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7800 
7801   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7802                                 &Hints, IAI);
7803   // Use the planner for outer loop vectorization.
7804   // TODO: CM is not used at this point inside the planner. Turn CM into an
7805   // optional argument if we don't need it in the future.
7806   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7807 
7808   // Get user vectorization factor.
7809   const unsigned UserVF = Hints.getWidth();
7810 
7811   // Plan how to best vectorize, return the best VF and its cost.
7812   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7813 
7814   // If we are stress testing VPlan builds, do not attempt to generate vector
7815   // code. Masked vector code generation support will follow soon.
7816   // Also, do not attempt to vectorize if no vector code will be produced.
7817   if (VPlanBuildStressTest || EnableVPlanPredication ||
7818       VectorizationFactor::Disabled() == VF)
7819     return false;
7820 
7821   LVP.setBestPlan(VF.Width, 1);
7822 
7823   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7824                          &CM, BFI, PSI);
7825   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7826                     << L->getHeader()->getParent()->getName() << "\"\n");
7827   LVP.executePlan(LB, DT);
7828 
7829   // Mark the loop as already vectorized to avoid vectorizing again.
7830   Hints.setAlreadyVectorized();
7831 
7832   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
7833   return true;
7834 }
7835 
7836 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
7837     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7838                                !EnableLoopInterleaving),
7839       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7840                               !EnableLoopVectorization) {}
7841 
7842 bool LoopVectorizePass::processLoop(Loop *L) {
7843   assert((EnableVPlanNativePath || L->empty()) &&
7844          "VPlan-native path is not enabled. Only process inner loops.");
7845 
7846 #ifndef NDEBUG
7847   const std::string DebugLocStr = getDebugLocString(L);
7848 #endif /* NDEBUG */
7849 
7850   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7851                     << L->getHeader()->getParent()->getName() << "\" from "
7852                     << DebugLocStr << "\n");
7853 
7854   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7855 
7856   LLVM_DEBUG(
7857       dbgs() << "LV: Loop hints:"
7858              << " force="
7859              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7860                      ? "disabled"
7861                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7862                             ? "enabled"
7863                             : "?"))
7864              << " width=" << Hints.getWidth()
7865              << " unroll=" << Hints.getInterleave() << "\n");
7866 
7867   // Function containing loop
7868   Function *F = L->getHeader()->getParent();
7869 
7870   // Looking at the diagnostic output is the only way to determine if a loop
7871   // was vectorized (other than looking at the IR or machine code), so it
7872   // is important to generate an optimization remark for each loop. Most of
7873   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7874   // generated as OptimizationRemark and OptimizationRemarkMissed are
7875   // less verbose reporting vectorized loops and unvectorized loops that may
7876   // benefit from vectorization, respectively.
7877 
7878   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7879     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7880     return false;
7881   }
7882 
7883   PredicatedScalarEvolution PSE(*SE, *L);
7884 
7885   // Check if it is legal to vectorize the loop.
7886   LoopVectorizationRequirements Requirements(*ORE);
7887   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7888                                 &Requirements, &Hints, DB, AC, BFI, PSI);
7889   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7890     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7891     Hints.emitRemarkWithHints();
7892     return false;
7893   }
7894 
7895   // Check the function attributes and profiles to find out if this function
7896   // should be optimized for size.
7897   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7898       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7899 
7900   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7901   // here. They may require CFG and instruction level transformations before
7902   // even evaluating whether vectorization is profitable. Since we cannot modify
7903   // the incoming IR, we need to build VPlan upfront in the vectorization
7904   // pipeline.
7905   if (!L->empty())
7906     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7907                                         ORE, BFI, PSI, Hints);
7908 
7909   assert(L->empty() && "Inner loop expected.");
7910 
7911   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7912   // count by optimizing for size, to minimize overheads.
7913   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7914   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7915     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7916                       << "This loop is worth vectorizing only if no scalar "
7917                       << "iteration overheads are incurred.");
7918     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7919       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7920     else {
7921       LLVM_DEBUG(dbgs() << "\n");
7922       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7923     }
7924   }
7925 
7926   // Check the function attributes to see if implicit floats are allowed.
7927   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7928   // an integer loop and the vector instructions selected are purely integer
7929   // vector instructions?
7930   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7931     reportVectorizationFailure(
7932         "Can't vectorize when the NoImplicitFloat attribute is used",
7933         "loop not vectorized due to NoImplicitFloat attribute",
7934         "NoImplicitFloat", ORE, L);
7935     Hints.emitRemarkWithHints();
7936     return false;
7937   }
7938 
7939   // Check if the target supports potentially unsafe FP vectorization.
7940   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7941   // for the target we're vectorizing for, to make sure none of the
7942   // additional fp-math flags can help.
7943   if (Hints.isPotentiallyUnsafe() &&
7944       TTI->isFPVectorizationPotentiallyUnsafe()) {
7945     reportVectorizationFailure(
7946         "Potentially unsafe FP op prevents vectorization",
7947         "loop not vectorized due to unsafe FP support.",
7948         "UnsafeFP", ORE, L);
7949     Hints.emitRemarkWithHints();
7950     return false;
7951   }
7952 
7953   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7954   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7955 
7956   // If an override option has been passed in for interleaved accesses, use it.
7957   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7958     UseInterleaved = EnableInterleavedMemAccesses;
7959 
7960   // Analyze interleaved memory accesses.
7961   if (UseInterleaved) {
7962     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7963   }
7964 
7965   // Use the cost model.
7966   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7967                                 F, &Hints, IAI);
7968   CM.collectValuesToIgnore();
7969 
7970   // Use the planner for vectorization.
7971   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
7972 
7973   // Get user vectorization factor and interleave count.
7974   unsigned UserVF = Hints.getWidth();
7975   unsigned UserIC = Hints.getInterleave();
7976 
7977   // Plan how to best vectorize, return the best VF and its cost.
7978   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
7979 
7980   VectorizationFactor VF = VectorizationFactor::Disabled();
7981   unsigned IC = 1;
7982 
7983   if (MaybeVF) {
7984     VF = *MaybeVF;
7985     // Select the interleave count.
7986     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7987   }
7988 
7989   // Identify the diagnostic messages that should be produced.
7990   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7991   bool VectorizeLoop = true, InterleaveLoop = true;
7992   if (Requirements.doesNotMeet(F, L, Hints)) {
7993     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7994                          "requirements.\n");
7995     Hints.emitRemarkWithHints();
7996     return false;
7997   }
7998 
7999   if (VF.Width == 1) {
8000     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8001     VecDiagMsg = std::make_pair(
8002         "VectorizationNotBeneficial",
8003         "the cost-model indicates that vectorization is not beneficial");
8004     VectorizeLoop = false;
8005   }
8006 
8007   if (!MaybeVF && UserIC > 1) {
8008     // Tell the user interleaving was avoided up-front, despite being explicitly
8009     // requested.
8010     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8011                          "interleaving should be avoided up front\n");
8012     IntDiagMsg = std::make_pair(
8013         "InterleavingAvoided",
8014         "Ignoring UserIC, because interleaving was avoided up front");
8015     InterleaveLoop = false;
8016   } else if (IC == 1 && UserIC <= 1) {
8017     // Tell the user interleaving is not beneficial.
8018     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8019     IntDiagMsg = std::make_pair(
8020         "InterleavingNotBeneficial",
8021         "the cost-model indicates that interleaving is not beneficial");
8022     InterleaveLoop = false;
8023     if (UserIC == 1) {
8024       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8025       IntDiagMsg.second +=
8026           " and is explicitly disabled or interleave count is set to 1";
8027     }
8028   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
8030     LLVM_DEBUG(
8031         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
8032     IntDiagMsg = std::make_pair(
8033         "InterleavingBeneficialButDisabled",
8034         "the cost-model indicates that interleaving is beneficial "
8035         "but is explicitly disabled or interleave count is set to 1");
8036     InterleaveLoop = false;
8037   }
8038 
8039   // Override IC if user provided an interleave count.
8040   IC = UserIC > 0 ? UserIC : IC;
8041 
8042   // Emit diagnostic messages, if any.
8043   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8044   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
8046     ORE->emit([&]() {
8047       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8048                                       L->getStartLoc(), L->getHeader())
8049              << VecDiagMsg.second;
8050     });
8051     ORE->emit([&]() {
8052       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8053                                       L->getStartLoc(), L->getHeader())
8054              << IntDiagMsg.second;
8055     });
8056     return false;
8057   } else if (!VectorizeLoop && InterleaveLoop) {
8058     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8059     ORE->emit([&]() {
8060       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8061                                         L->getStartLoc(), L->getHeader())
8062              << VecDiagMsg.second;
8063     });
8064   } else if (VectorizeLoop && !InterleaveLoop) {
8065     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8066                       << ") in " << DebugLocStr << '\n');
8067     ORE->emit([&]() {
8068       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8069                                         L->getStartLoc(), L->getHeader())
8070              << IntDiagMsg.second;
8071     });
8072   } else if (VectorizeLoop && InterleaveLoop) {
8073     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8074                       << ") in " << DebugLocStr << '\n');
8075     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8076   }
8077 
8078   LVP.setBestPlan(VF.Width, IC);
8079 
8080   using namespace ore;
8081   bool DisableRuntimeUnroll = false;
8082   MDNode *OrigLoopID = L->getLoopID();
8083 
8084   if (!VectorizeLoop) {
8085     assert(IC > 1 && "interleave count should not be 1 or 0");
8086     // If we decided that it is not legal to vectorize the loop, then
8087     // interleave it.
8088     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8089                                BFI, PSI);
8090     LVP.executePlan(Unroller, DT);
8091 
8092     ORE->emit([&]() {
8093       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8094                                 L->getHeader())
8095              << "interleaved loop (interleaved count: "
8096              << NV("InterleaveCount", IC) << ")";
8097     });
8098   } else {
8099     // If we decided that it is *legal* to vectorize the loop, then do it.
8100     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8101                            &LVL, &CM, BFI, PSI);
8102     LVP.executePlan(LB, DT);
8103     ++LoopsVectorized;
8104 
8105     // Add metadata to disable runtime unrolling a scalar loop when there are
8106     // no runtime checks about strides and memory. A scalar loop that is
8107     // rarely used is not worth unrolling.
8108     if (!LB.areSafetyChecksAdded())
8109       DisableRuntimeUnroll = true;
8110 
8111     // Report the vectorization decision.
8112     ORE->emit([&]() {
8113       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8114                                 L->getHeader())
8115              << "vectorized loop (vectorization width: "
8116              << NV("VectorizationFactor", VF.Width)
8117              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8118     });
8119   }
8120 
8121   Optional<MDNode *> RemainderLoopID =
8122       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8123                                       LLVMLoopVectorizeFollowupEpilogue});
8124   if (RemainderLoopID.hasValue()) {
8125     L->setLoopID(RemainderLoopID.getValue());
8126   } else {
8127     if (DisableRuntimeUnroll)
8128       AddRuntimeUnrollDisableMetaData(L);
8129 
8130     // Mark the loop as already vectorized to avoid vectorizing again.
8131     Hints.setAlreadyVectorized();
8132   }
8133 
8134   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8135   return true;
8136 }
8137 
8138 LoopVectorizeResult LoopVectorizePass::runImpl(
8139     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8140     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8141     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8142     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8143     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8144   SE = &SE_;
8145   LI = &LI_;
8146   TTI = &TTI_;
8147   DT = &DT_;
8148   BFI = &BFI_;
8149   TLI = TLI_;
8150   AA = &AA_;
8151   AC = &AC_;
8152   GetLAA = &GetLAA_;
8153   DB = &DB_;
8154   ORE = &ORE_;
8155   PSI = PSI_;
8156 
8157   // Don't attempt if
8158   // 1. the target claims to have no vector registers, and
8159   // 2. interleaving won't help ILP.
8160   //
8161   // The second condition is necessary because, even if the target has no
8162   // vector registers, loop vectorization may still enable scalar
8163   // interleaving.
8164   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8165       TTI->getMaxInterleaveFactor(1) < 2)
8166     return LoopVectorizeResult(false, false);
8167 
8168   bool Changed = false, CFGChanged = false;
8169 
8170   // The vectorizer requires loops to be in simplified form.
8171   // Since simplification may add new inner loops, it has to run before the
8172   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8174   // vectorized.
8175   for (auto &L : *LI)
8176     Changed |= CFGChanged |=
8177         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8178 
8179   // Build up a worklist of inner-loops to vectorize. This is necessary as
8180   // the act of vectorizing or partially unrolling a loop creates new loops
8181   // and can invalidate iterators across the loops.
8182   SmallVector<Loop *, 8> Worklist;
8183 
8184   for (Loop *L : *LI)
8185     collectSupportedLoops(*L, LI, ORE, Worklist);
8186 
8187   LoopsAnalyzed += Worklist.size();
8188 
8189   // Now walk the identified inner loops.
8190   while (!Worklist.empty()) {
8191     Loop *L = Worklist.pop_back_val();
8192 
8193     // For the inner loops we actually process, form LCSSA to simplify the
8194     // transform.
8195     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8196 
8197     Changed |= CFGChanged |= processLoop(L);
8198   }
8199 
8200   // Process each loop nest in the function.
8201   return LoopVectorizeResult(Changed, CFGChanged);
8202 }
8203 
8204 PreservedAnalyses LoopVectorizePass::run(Function &F,
8205                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
8248 }
8249