//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
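//
// As an illustrative sketch (a simplified, source-level view of the IR-level
// transformation; VF stands for the chosen vector width), a scalar loop like
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten into
//
//   for (i = 0; i + VF <= n; i += VF)
//     a[i:i+VF] = b[i:i+VF] + c[i:i+VF];  // one 'wide' SIMD iteration
//   for (; i < n; i++)                    // scalar epilogue handles the
//     a[i] = b[i] + c[i];                 // remaining iterations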
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// possible settings. That is, the vectorizer will try to fold the tail loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, the fallback strategy depends on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(
            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
            "data-and-control-without-rt-check",
            "Similar to data-and-control, but remove the runtime check")));
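
// As a hedged illustration of the lane-mask styles above (the exact IR depends
// on the target and the chosen style), a tail-folded loop guards its memory
// operations with an active-lane mask such as:
//
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index,
//                                                             i64 %trip.count)
//
// so that lanes beyond the trip count are masked off instead of being peeled
// into a scalar epilogue.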

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
}

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
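// Note: these weights are attached to the bypass branches as !prof
// branch_weights metadata (e.g. !{!"branch_weights", i32 1, i32 127}),
// i.e. each bypass is assumed taken roughly once per 128 executions of the
// corresponding check.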

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
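
// For example (illustrative; the exact sizes depend on the target's
// DataLayout), x86_fp80 typically has a type size of 80 bits but an alloc
// size of 96 or 128 bits, so arrays of it contain padding and the type is
// irregular, while i32 (32-bit size and 32-bit alloc size) is not.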

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
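
// For example, with the current return value of 2 the cost model charges a
// predicated block roughly half of its per-iteration cost, matching the
// assumed 50% execution probability documented above.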

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
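///
/// For example (illustrative), interleaving two fixed-length vectors
///   A = <A0, A1, A2, A3> and B = <B0, B1, B2, B3>
/// yields <A0, B0, A1, B1, A2, B2, A3, B3>; on the fixed-length path below
/// this corresponds to the shuffle mask <0, 4, 1, 5, 2, 6, 3, 7>.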
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
        /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}
483fe013be4SDimitry Andric 
484bdd1243dSDimitry Andric namespace {
485fe6060f1SDimitry Andric // Forward declare GeneratedRTChecks.
486fe6060f1SDimitry Andric class GeneratedRTChecks;
487fe013be4SDimitry Andric 
488fe013be4SDimitry Andric using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489bdd1243dSDimitry Andric } // namespace
490fe6060f1SDimitry Andric 
4910b57cec5SDimitry Andric namespace llvm {
4920b57cec5SDimitry Andric 
4930eae32dcSDimitry Andric AnalysisKey ShouldRunExtraVectorPasses::Key;
4940eae32dcSDimitry Andric 
4950b57cec5SDimitry Andric /// InnerLoopVectorizer vectorizes loops which contain only one basic
4960b57cec5SDimitry Andric /// block to a specified vectorization factor (VF).
4970b57cec5SDimitry Andric /// This class performs the widening of scalars into vectors, or multiple
4980b57cec5SDimitry Andric /// scalars. This class also implements the following features:
4990b57cec5SDimitry Andric /// * It inserts an epilogue loop for handling loops that don't have iteration
5000b57cec5SDimitry Andric ///   counts that are known to be a multiple of the vectorization factor.
5010b57cec5SDimitry Andric /// * It handles the code generation for reduction variables.
5020b57cec5SDimitry Andric /// * Scalarization (implementation using scalars) of un-vectorizable
5030b57cec5SDimitry Andric ///   instructions.
5040b57cec5SDimitry Andric /// InnerLoopVectorizer does not perform any vectorization-legality
5050b57cec5SDimitry Andric /// checks, and relies on the caller to check for the different legality
5060b57cec5SDimitry Andric /// aspects. The InnerLoopVectorizer relies on the
5070b57cec5SDimitry Andric /// LoopVectorizationLegality class to provide information about the induction
5080b57cec5SDimitry Andric /// and reduction variables that were found to a given vectorization factor.
5090b57cec5SDimitry Andric class InnerLoopVectorizer {
5100b57cec5SDimitry Andric public:
InnerLoopVectorizer(Loop * OrigLoop,PredicatedScalarEvolution & PSE,LoopInfo * LI,DominatorTree * DT,const TargetLibraryInfo * TLI,const TargetTransformInfo * TTI,AssumptionCache * AC,OptimizationRemarkEmitter * ORE,ElementCount VecWidth,ElementCount MinProfitableTripCount,unsigned UnrollFactor,LoopVectorizationLegality * LVL,LoopVectorizationCostModel * CM,BlockFrequencyInfo * BFI,ProfileSummaryInfo * PSI,GeneratedRTChecks & RTChecks)5110b57cec5SDimitry Andric   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
5120b57cec5SDimitry Andric                       LoopInfo *LI, DominatorTree *DT,
5130b57cec5SDimitry Andric                       const TargetLibraryInfo *TLI,
5140b57cec5SDimitry Andric                       const TargetTransformInfo *TTI, AssumptionCache *AC,
515e8d8bef9SDimitry Andric                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
516753f127fSDimitry Andric                       ElementCount MinProfitableTripCount,
5170b57cec5SDimitry Andric                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
518e8d8bef9SDimitry Andric                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
519fe6060f1SDimitry Andric                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
5200b57cec5SDimitry Andric       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
5210b57cec5SDimitry Andric         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
522fe6060f1SDimitry Andric         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
523fe6060f1SDimitry Andric         PSI(PSI), RTChecks(RTChecks) {
524e8d8bef9SDimitry Andric     // Query this against the original loop and save it here because the profile
525e8d8bef9SDimitry Andric     // of the original loop header may change as the transformation happens.
526e8d8bef9SDimitry Andric     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
527e8d8bef9SDimitry Andric         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
528753f127fSDimitry Andric 
529753f127fSDimitry Andric     if (MinProfitableTripCount.isZero())
530753f127fSDimitry Andric       this->MinProfitableTripCount = VecWidth;
531753f127fSDimitry Andric     else
532753f127fSDimitry Andric       this->MinProfitableTripCount = MinProfitableTripCount;
533e8d8bef9SDimitry Andric   }
534e8d8bef9SDimitry Andric 
5350b57cec5SDimitry Andric   virtual ~InnerLoopVectorizer() = default;
5360b57cec5SDimitry Andric 
537e8d8bef9SDimitry Andric   /// Create a new empty loop that will contain vectorized instructions later
538e8d8bef9SDimitry Andric   /// on, while the old loop will be used as the scalar remainder. Control flow
539e8d8bef9SDimitry Andric   /// is generated around the vectorized (and scalar epilogue) loops consisting
540e8d8bef9SDimitry Andric   /// of various checks and bypasses. Return the pre-header block of the new
54104eeddc0SDimitry Andric   /// loop and the start value for the canonical induction, if it is != 0. The
54204eeddc0SDimitry Andric   /// latter is the case when vectorizing the epilogue loop. In the case of
54304eeddc0SDimitry Andric   /// epilogue vectorization, this function is overriden to handle the more
544fe013be4SDimitry Andric   /// complex control flow around the loops.  \p ExpandedSCEVs is used to
545fe013be4SDimitry Andric   /// look up SCEV expansions for expressions needed during skeleton creation.
546fe013be4SDimitry Andric   virtual std::pair<BasicBlock *, Value *>
547fe013be4SDimitry Andric   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
5480b57cec5SDimitry Andric 
5490b57cec5SDimitry Andric   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
55081ad6265SDimitry Andric   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
5510b57cec5SDimitry Andric 
5520b57cec5SDimitry Andric   // Return true if any runtime check is added.
areSafetyChecksAdded()5530b57cec5SDimitry Andric   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
5540b57cec5SDimitry Andric 
5550b57cec5SDimitry Andric   /// A type for vectorized values in the new loop. Each value from the
5560b57cec5SDimitry Andric   /// original loop, when vectorized, is represented by UF vector values in the
5570b57cec5SDimitry Andric   /// new unrolled loop, where UF is the unroll factor.
5580b57cec5SDimitry Andric   using VectorParts = SmallVector<Value *, 2>;
5590b57cec5SDimitry Andric 
5600b57cec5SDimitry Andric   /// A helper function to scalarize a single Instruction in the innermost loop.
5610b57cec5SDimitry Andric   /// Generates a sequence of scalar instances for each lane between \p MinLane
5620b57cec5SDimitry Andric   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
5634824e7fdSDimitry Andric   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
5645ffd83dbSDimitry Andric   /// Instr's operands.
565bdd1243dSDimitry Andric   void scalarizeInstruction(const Instruction *Instr,
566bdd1243dSDimitry Andric                             VPReplicateRecipe *RepRecipe,
567fe013be4SDimitry Andric                             const VPIteration &Instance,
5685ffd83dbSDimitry Andric                             VPTransformState &State);
5690b57cec5SDimitry Andric 
5705ffd83dbSDimitry Andric   /// Try to vectorize interleaved access group \p Group with the base address
5715ffd83dbSDimitry Andric   /// given in \p Addr, optionally masking the vector operations if \p
5725ffd83dbSDimitry Andric   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
5735ffd83dbSDimitry Andric   /// values in the vectorized loop.
5745ffd83dbSDimitry Andric   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
575e8d8bef9SDimitry Andric                                 ArrayRef<VPValue *> VPDefs,
5765ffd83dbSDimitry Andric                                 VPTransformState &State, VPValue *Addr,
577e8d8bef9SDimitry Andric                                 ArrayRef<VPValue *> StoredValues,
578fe013be4SDimitry Andric                                 VPValue *BlockInMask, bool NeedsMaskForGaps);
5790b57cec5SDimitry Andric 
58081ad6265SDimitry Andric   /// Fix the non-induction PHIs in \p Plan.
58181ad6265SDimitry Andric   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
582fe6060f1SDimitry Andric 
583fe6060f1SDimitry Andric   /// Returns true if the reordering of FP operations is not allowed, but we are
584fe6060f1SDimitry Andric   /// able to vectorize with strict in-order reductions for the given RdxDesc.
5850eae32dcSDimitry Andric   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
586fe6060f1SDimitry Andric 
587bdd1243dSDimitry Andric   /// Create a new phi node for the induction variable \p OrigPhi to resume
588bdd1243dSDimitry Andric   /// iteration count in the scalar epilogue, from where the vectorized loop
589fe013be4SDimitry Andric   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
590fe013be4SDimitry Andric   /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
591fe013be4SDimitry Andric   /// and the resume values can come from an additional bypass block, the \p
592fe013be4SDimitry Andric   /// AdditionalBypass pair provides information about the bypass block and the
593fe013be4SDimitry Andric   /// end value on the edge from bypass to this loop.
594bdd1243dSDimitry Andric   PHINode *createInductionResumeValue(
595fe013be4SDimitry Andric       PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
596bdd1243dSDimitry Andric       ArrayRef<BasicBlock *> BypassBlocks,
597bdd1243dSDimitry Andric       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
598bdd1243dSDimitry Andric 
599fe013be4SDimitry Andric   /// Returns the original loop trip count.
getTripCount() const600fe013be4SDimitry Andric   Value *getTripCount() const { return TripCount; }
601fe013be4SDimitry Andric 
602fe013be4SDimitry Andric   /// Used to set the trip count after ILV's construction and after the
603fe013be4SDimitry Andric   /// preheader block has been executed. Note that this always holds the trip
604fe013be4SDimitry Andric   /// count of the original loop for both main loop and epilogue vectorization.
setTripCount(Value * TC)605fe013be4SDimitry Andric   void setTripCount(Value *TC) { TripCount = TC; }
606fe013be4SDimitry Andric 
protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
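  // For example (illustrative), with UF = 2 and VF = 4, each scalarized value
  // is held as 2 parts, each part holding 4 per-lane scalar Values.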

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan, VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};
693e8d8bef9SDimitry Andric 
6940b57cec5SDimitry Andric   /// The original loop.
6950b57cec5SDimitry Andric   Loop *OrigLoop;
6960b57cec5SDimitry Andric 
6970b57cec5SDimitry Andric   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
6980b57cec5SDimitry Andric   /// dynamic knowledge to simplify SCEV expressions and converts them to a
6990b57cec5SDimitry Andric   /// more usable form.
7000b57cec5SDimitry Andric   PredicatedScalarEvolution &PSE;
7010b57cec5SDimitry Andric 
7020b57cec5SDimitry Andric   /// Loop Info.
7030b57cec5SDimitry Andric   LoopInfo *LI;
7040b57cec5SDimitry Andric 
7050b57cec5SDimitry Andric   /// Dominator Tree.
7060b57cec5SDimitry Andric   DominatorTree *DT;
7070b57cec5SDimitry Andric 
7080b57cec5SDimitry Andric   /// Target Library Info.
7090b57cec5SDimitry Andric   const TargetLibraryInfo *TLI;
7100b57cec5SDimitry Andric 
7110b57cec5SDimitry Andric   /// Target Transform Info.
7120b57cec5SDimitry Andric   const TargetTransformInfo *TTI;
7130b57cec5SDimitry Andric 
7140b57cec5SDimitry Andric   /// Assumption Cache.
7150b57cec5SDimitry Andric   AssumptionCache *AC;
7160b57cec5SDimitry Andric 
7170b57cec5SDimitry Andric   /// Interface to emit optimization remarks.
7180b57cec5SDimitry Andric   OptimizationRemarkEmitter *ORE;
7190b57cec5SDimitry Andric 
7200b57cec5SDimitry Andric   /// The vectorization SIMD factor to use. Each vector will have this many
7210b57cec5SDimitry Andric   /// vector elements.
722e8d8bef9SDimitry Andric   ElementCount VF;
7230b57cec5SDimitry Andric 
724753f127fSDimitry Andric   ElementCount MinProfitableTripCount;
725753f127fSDimitry Andric 
7260b57cec5SDimitry Andric   /// The vectorization unroll factor to use. Each scalar is vectorized to this
7270b57cec5SDimitry Andric   /// many different vector instructions.
7280b57cec5SDimitry Andric   unsigned UF;
7290b57cec5SDimitry Andric 
7300b57cec5SDimitry Andric   /// The builder that we use
7310b57cec5SDimitry Andric   IRBuilder<> Builder;
7320b57cec5SDimitry Andric 
7330b57cec5SDimitry Andric   // --- Vectorization state ---
7340b57cec5SDimitry Andric 
7350b57cec5SDimitry Andric   /// The vector-loop preheader.
7360b57cec5SDimitry Andric   BasicBlock *LoopVectorPreHeader;
7370b57cec5SDimitry Andric 
7380b57cec5SDimitry Andric   /// The scalar-loop preheader.
7390b57cec5SDimitry Andric   BasicBlock *LoopScalarPreHeader;
7400b57cec5SDimitry Andric 
7410b57cec5SDimitry Andric   /// Middle Block between the vector and the scalar.
7420b57cec5SDimitry Andric   BasicBlock *LoopMiddleBlock;
7430b57cec5SDimitry Andric 
744fe6060f1SDimitry Andric   /// The unique ExitBlock of the scalar loop if one exists.  Note that
745e8d8bef9SDimitry Andric   /// there can be multiple exiting edges reaching this block.
7460b57cec5SDimitry Andric   BasicBlock *LoopExitBlock;
7470b57cec5SDimitry Andric 
7480b57cec5SDimitry Andric   /// The scalar loop body.
7490b57cec5SDimitry Andric   BasicBlock *LoopScalarBody;
7500b57cec5SDimitry Andric 
7510b57cec5SDimitry Andric   /// A list of all bypass blocks. The first block is the entry of the loop.
7520b57cec5SDimitry Andric   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
7530b57cec5SDimitry Andric 
7540b57cec5SDimitry Andric   /// Store instructions that were predicated.
7550b57cec5SDimitry Andric   SmallVector<Instruction *, 4> PredicatedInstructions;
7560b57cec5SDimitry Andric 
7570b57cec5SDimitry Andric   /// Trip count of the original loop.
7580b57cec5SDimitry Andric   Value *TripCount = nullptr;
7590b57cec5SDimitry Andric 
7600b57cec5SDimitry Andric   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
7610b57cec5SDimitry Andric   Value *VectorTripCount = nullptr;
7620b57cec5SDimitry Andric 
7630b57cec5SDimitry Andric   /// The legality analysis.
7640b57cec5SDimitry Andric   LoopVectorizationLegality *Legal;
7650b57cec5SDimitry Andric 
7660b57cec5SDimitry Andric   /// The profitablity analysis.
7670b57cec5SDimitry Andric   LoopVectorizationCostModel *Cost;
7680b57cec5SDimitry Andric 
7690b57cec5SDimitry Andric   // Record whether runtime checks are added.
7700b57cec5SDimitry Andric   bool AddedSafetyChecks = false;
7710b57cec5SDimitry Andric 
7720b57cec5SDimitry Andric   // Holds the end values for each induction variable. We save the end values
7730b57cec5SDimitry Andric   // so we can later fix-up the external users of the induction variables.
7740b57cec5SDimitry Andric   DenseMap<PHINode *, Value *> IVEndValues;
7750b57cec5SDimitry Andric 
776e8d8bef9SDimitry Andric   /// BFI and PSI are used to check for profile guided size optimizations.
777e8d8bef9SDimitry Andric   BlockFrequencyInfo *BFI;
778e8d8bef9SDimitry Andric   ProfileSummaryInfo *PSI;
779e8d8bef9SDimitry Andric 
780e8d8bef9SDimitry Andric   // Whether this loop should be optimized for size based on profile guided size
781e8d8bef9SDimitry Andric   // optimizatios.
782e8d8bef9SDimitry Andric   bool OptForSizeBasedOnProfile;
783fe6060f1SDimitry Andric 
784fe6060f1SDimitry Andric   /// Structure to hold information about generated runtime checks, responsible
785fe6060f1SDimitry Andric   /// for cleaning the checks, if vectorization turns out unprofitable.
786fe6060f1SDimitry Andric   GeneratedRTChecks &RTChecks;
78704eeddc0SDimitry Andric 
78804eeddc0SDimitry Andric   // Holds the resume values for reductions in the loops, used to set the
78904eeddc0SDimitry Andric   // correct start value of reduction PHIs when vectorizing the epilogue.
79004eeddc0SDimitry Andric   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
79104eeddc0SDimitry Andric       ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
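
// Illustrative use only (the variable name is hypothetical): a main loop at
// fixed VF 8 with UF 2 and an epilogue at fixed VF 4 would be described as
//   EpilogueLoopVectorizationInfo EPI(ElementCount::getFixed(8), 2,
//                                     ElementCount::getFixed(4), 1);
// The epilogue UF must be 1, per the assertion in the constructor.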

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return DebugLoc();

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}
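
// For example (illustrative): if a transform dropped the DebugLoc of an add
// instruction, the location of one of its instruction operands is returned
// instead, so remarks can still point near the original source line.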

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark.  \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}
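
// For example (illustrative): with Step = 2, a fixed VF of 4 yields the
// constant 8, while a scalable VF of 4 materializes the runtime quantity
// 8 * vscale.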

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *OrigLoop) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();
  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}
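
// Note: the trip count is the backedge-taken count plus one, evaluated in
// IdxTy; e.g. a loop whose backedge is taken 99 times has a trip count of 100.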

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}
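
// A typical emitted remark (illustrative): "vectorized loop (vectorization
// width: 4, interleaved count: 2)".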

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        Instruction *Instr = dyn_cast_or_null<Instruction>(
            CurRec->getVPSingleValue()->getUnderlyingValue());
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *Operand : CurRec->operands())
        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}
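
// Illustrative scenario: for a guarded access such as
//   if (c) { p = getelementptr inbounds %base, %i; v = load %p; }
// widening the load into an unconditional consecutive vector load means an
// inbounds flag feeding the address could now yield poison for lanes whose
// guard was false, so such flags are dropped along the address computation.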

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the expected
/// speedup/slowdown due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };
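
  // Illustrative reading: MaxLocalUsers[ClassID] == 5 means that, at the
  // queried VF, at most five values of that register class are live
  // simultaneously anywhere in the loop.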

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc, i.e. the IsOrdered flag of RdxDesc is set and we do not allow
  /// reordering of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    // Pseudo probes need to be duplicated for each unrolled iteration and
    // vector lane so that the profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.contains(I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for a memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,
    CM_IntrinsicCall
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }
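
  // For example (illustrative): in a group with factor 4, the member at the
  // insert position records the full group cost while the other members are
  // recorded with cost 0, so the group's cost is not counted more than once.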

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }
14230b57cec5SDimitry Andric 
1424c9157d92SDimitry Andric   struct CallWideningDecision {
1425c9157d92SDimitry Andric     InstWidening Kind;
1426c9157d92SDimitry Andric     Function *Variant;
1427c9157d92SDimitry Andric     Intrinsic::ID IID;
1428c9157d92SDimitry Andric     std::optional<unsigned> MaskPos;
1429c9157d92SDimitry Andric     InstructionCost Cost;
1430c9157d92SDimitry Andric   };
1431c9157d92SDimitry Andric 
setCallWideningDecision(CallInst * CI,ElementCount VF,InstWidening Kind,Function * Variant,Intrinsic::ID IID,std::optional<unsigned> MaskPos,InstructionCost Cost)1432c9157d92SDimitry Andric   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1433c9157d92SDimitry Andric                                Function *Variant, Intrinsic::ID IID,
1434c9157d92SDimitry Andric                                std::optional<unsigned> MaskPos,
1435c9157d92SDimitry Andric                                InstructionCost Cost) {
1436c9157d92SDimitry Andric     assert(!VF.isScalar() && "Expected vector VF");
1437c9157d92SDimitry Andric     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1438c9157d92SDimitry Andric                                                      MaskPos, Cost};
1439c9157d92SDimitry Andric   }
1440c9157d92SDimitry Andric 
getCallWideningDecision(CallInst * CI,ElementCount VF) const1441c9157d92SDimitry Andric   CallWideningDecision getCallWideningDecision(CallInst *CI,
1442c9157d92SDimitry Andric                                                ElementCount VF) const {
1443c9157d92SDimitry Andric     assert(!VF.isScalar() && "Expected vector VF");
1444c9157d92SDimitry Andric     return CallWideningDecisions.at(std::make_pair(CI, VF));
1445c9157d92SDimitry Andric   }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }
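
  // Illustrative case: for a non-free `trunc i64 %iv to i32` of an induction
  // PHI, the truncate can be removed entirely by introducing a new i32
  // induction variable instead of widening the trunc as a vector operation.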

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.contains(VF))
      return;
    setCostBasedWideningDecision(VF);
    setVectorizedCallDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }
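
  // Note (illustrative): results are cached per VF -- the early return above
  // makes repeated queries for an already-analyzed VF essentially free.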

  /// Returns true if the target machine supports a masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports a masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    if (VF.isVector())
      Ty = VectorType::get(Ty, VF);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }
1521fe6060f1SDimitry Andric 
1522fe6060f1SDimitry Andric   /// Returns true if the target machine supports all of the reduction
1523fe6060f1SDimitry Andric   /// variables found for the given VF.
canVectorizeReductions(ElementCount VF) const1524fe6060f1SDimitry Andric   bool canVectorizeReductions(ElementCount VF) const {
1525fe6060f1SDimitry Andric     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526fe6060f1SDimitry Andric       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527fe6060f1SDimitry Andric       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528fe6060f1SDimitry Andric     }));
15290b57cec5SDimitry Andric   }

  /// Given costs for both strategies, return true if the scalar predication
  /// lowering should be used for div/rem.  This incorporates an override
  /// option so it is not simply a cost comparison.
  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
                                     InstructionCost SafeDivisorCost) const {
    switch (ForceSafeDivisor) {
    case cl::BOU_UNSET:
      return ScalarCost < SafeDivisorCost;
    case cl::BOU_TRUE:
      return false;
    case cl::BOU_FALSE:
      return true;
    }
    llvm_unreachable("impossible case value");
  }
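
  // Sketch of the two strategies for a predicated udiv (illustrative IR):
  // scalarize-with-predication executes the division only in active lanes via
  // per-lane branches, while the safe-divisor strategy widens unconditionally
  // but substitutes a harmless divisor in the masked-off lanes, e.g.
  //   %safe = select <4 x i1> %mask, <4 x i32> %d,
  //                  <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  //   %q = udiv <4 x i32> %x, %safe
  // The ForceSafeDivisor override above pins the choice regardless of cost.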

  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime.  The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
  std::pair<InstructionCost, InstructionCost>
  getDivRemSpeculationCost(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
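
  // Example of an interleave group (a sketch): the two loads in
  //   for (i = 0; i < n; i++) { x = A[2*i]; y = A[2*i+1]; }
  // form a group with factor 2; they can be widened into one wide load of
  // 2*VF elements followed by shuffles that de-interleave the even and odd
  // lanes.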

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
  }
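
  // One case that forces this (a sketch): an interleave group with a gap,
  // e.g. reads of A[3*i] and A[3*i+1] with A[3*i+2] never accessed. Widening
  // the last iterations could make the wide load run past the end of A, so
  // those iterations are peeled off into the scalar epilogue.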

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop for all VFs in \p Range.
  /// A scalar epilogue must either be required for all VFs in \p Range or for
  /// none.
  bool requiresScalarEpilogue(VFRange Range) const {
    auto RequiresScalarEpilogue = [this](ElementCount VF) {
      return requiresScalarEpilogue(VF.isVector());
    };
    bool IsRequired = all_of(Range, RequiresScalarEpilogue);
    assert(
        (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
        "all VFs in range must agree on whether a scalar epilogue is required");
    return IsRequired;
  }

  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited due to
  /// optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns the TailFoldingStyle that is best for the current loop.
  TailFoldingStyle
  getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
    if (!CanFoldTailByMasking)
      return TailFoldingStyle::None;

    if (ForceTailFoldingStyle.getNumOccurrences())
      return ForceTailFoldingStyle;

    return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
  }

  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
  bool foldTailByMasking() const {
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }
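
  // Tail folding sketch: with trip count 10 and VF = 4, instead of 2 vector
  // iterations plus a 2-iteration scalar remainder, the loop executes 3
  // masked vector iterations with lane masks
  //   <1,1,1,1>, <1,1,1,1>, <1,1,0,0>
  // and needs no scalar epilogue.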

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF.  Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  bool hasPredStores() const { return NumPredStores > 0; }

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  std::optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind) const;

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
                                           TTI::TargetCostKind CostKind) const;

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set per VF containing all BasicBlocks that are known to be present
  /// after vectorization as a predicated block.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
  bool CanFoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  CallDecisionList CallWideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
  }

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved
  /// accesses that share the same stride and are close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;
};
} // end namespace llvm

namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, so their cost can be estimated more accurately. After
/// deciding to vectorize, the checks are moved back. If deciding not to
/// vectorize, the temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  bool CostTooHigh = false;
  const bool AddBranchWeights;

  Loop *OuterLoop = nullptr;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    TargetTransformInfo *TTI, const DataLayout &DL,
                    bool AddBranchWeights)
      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {

    // Hard cutoff to limit compile-time increase in case a very large number
    // of runtime checks need to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh)
      return;

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
            [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
            MemCheckExp, VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }

    // The outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C =
            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C =
            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop,
      // we should find out if these checks are outer-loop invariant. If so,
      // the checks will likely be hoisted out and the effective cost is
      // reduced in proportion to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
        if (SE->isLoopInvariant(Cond, OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // If the exact trip count is known, use that.
          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
            BestTripCount = SmallTC;
          else if (LoopVectorizeWithBlockFrequency) {
            // Otherwise, use profile data if available.
            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
              BestTripCount = *EstimatedTC;
          }

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
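
          // For example (illustrative numbers only): if MemCheckCost is 8 and
          // BestTripCount is 4, the amortized cost charged here is 8 / 4 = 2.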

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
                                     (InstructionCost::CostType)1);

          LLVM_DEBUG(dbgs()
                     << "We expect runtime memory checks to be hoisted "
                     << "out of the outer loop. Cost reduced from "
                     << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;

    Value *Cond = SCEVCheckCond;
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    if (auto *C = dyn_cast<ConstantInt>(Cond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
    if (AddBranchWeights)
      setBranchWeights(BI, SCEVCheckBypassWeights);
    ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays
    // overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);

    BranchInst &BI =
        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
    if (AddBranchWeights)
      setBranchWeights(BI, MemCheckBypassWeights);
    ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};
} // namespace

static bool useActiveLaneMask(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::Data ||
         Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
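
// A sketch of what these styles imply: the loop body is predicated on an
// active-lane mask such as
//   %alm = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %n)
// which is all-true while at least 4 iterations remain and disables the
// excess lanes in the final iteration; the DataAndControlFlow styles
// additionally drive the loop's exit branch from that mask.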

static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
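
// For example (C source level, a sketch): an outer loop qualifies when
// annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)
//     for (int j = 0; j < m; ++j)
//       A[i][j] = 0;
// whereas the same nest without an explicit width, or with an interleave
// hint, is rejected by the checks above.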

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If
  // we are stress testing the VPlan H-CFG construction, we collect the
  // outermost loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}
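
// Recursion sketch: for a nest Outer{Inner}, an explicitly annotated Outer is
// collected (when the VPlan-native path is enabled) and the recursion stops
// there; otherwise the function descends and collects the innermost loop
// Inner.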
23280b57cec5SDimitry Andric 
23290b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
23300b57cec5SDimitry Andric // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
23310b57cec5SDimitry Andric // LoopVectorizationCostModel and LoopVectorizationPlanner.
23320b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
23330b57cec5SDimitry Andric 
233481ad6265SDimitry Andric /// Compute the transformed value of Index at offset StartValue using step
233581ad6265SDimitry Andric /// Step.
233681ad6265SDimitry Andric /// For integer induction, returns StartValue + Index * Step.
233781ad6265SDimitry Andric /// For pointer induction, returns StartValue[Index * Step].
233881ad6265SDimitry Andric /// FIXME: The newly created binary instructions should contain nsw/nuw
233981ad6265SDimitry Andric /// flags, which can be found from the original scalar operations.
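/// As an illustrative sketch (the concrete values here are hypothetical, not
/// taken from any particular caller): with StartValue = %start, Step = 4 and
/// Index = %i, an integer induction yields %start + %i * 4, while a pointer
/// induction yields the address &StartValue[%i * 4].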
2340c9157d92SDimitry Andric static Value *
2341c9157d92SDimitry Andric emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2342c9157d92SDimitry Andric                      Value *Step,
2343c9157d92SDimitry Andric                      InductionDescriptor::InductionKind InductionKind,
2344c9157d92SDimitry Andric                      const BinaryOperator *InductionBinOp) {
2345bdd1243dSDimitry Andric   Type *StepTy = Step->getType();
2346bdd1243dSDimitry Andric   Value *CastedIndex = StepTy->isIntegerTy()
2347bdd1243dSDimitry Andric                            ? B.CreateSExtOrTrunc(Index, StepTy)
2348bdd1243dSDimitry Andric                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2349bdd1243dSDimitry Andric   if (CastedIndex != Index) {
2350bdd1243dSDimitry Andric     CastedIndex->setName(CastedIndex->getName() + ".cast");
2351bdd1243dSDimitry Andric     Index = CastedIndex;
2352bdd1243dSDimitry Andric   }
235381ad6265SDimitry Andric 
235481ad6265SDimitry Andric   // Note: the IR at this point is broken. We cannot use SE to create new
235581ad6265SDimitry Andric   // SCEVs and then expand them, hoping that SCEV's simplification will give
235681ad6265SDimitry Andric   // us more optimal code. Unfortunately, attempting to do so on invalid IR
235781ad6265SDimitry Andric   // may lead to various SCEV crashes. So all we can do is use the builder
235881ad6265SDimitry Andric   // and rely on InstCombine for future simplifications. Here we handle only
235981ad6265SDimitry Andric   // some trivial cases.
236081ad6265SDimitry Andric   auto CreateAdd = [&B](Value *X, Value *Y) {
236181ad6265SDimitry Andric     assert(X->getType() == Y->getType() && "Types don't match!");
236281ad6265SDimitry Andric     if (auto *CX = dyn_cast<ConstantInt>(X))
236381ad6265SDimitry Andric       if (CX->isZero())
236481ad6265SDimitry Andric         return Y;
236581ad6265SDimitry Andric     if (auto *CY = dyn_cast<ConstantInt>(Y))
236681ad6265SDimitry Andric       if (CY->isZero())
236781ad6265SDimitry Andric         return X;
236881ad6265SDimitry Andric     return B.CreateAdd(X, Y);
236981ad6265SDimitry Andric   };
237081ad6265SDimitry Andric 
237181ad6265SDimitry Andric   // We allow X to be a vector type, in which case Y will potentially be
237281ad6265SDimitry Andric   // splatted into a vector with the same element count.
237381ad6265SDimitry Andric   auto CreateMul = [&B](Value *X, Value *Y) {
237481ad6265SDimitry Andric     assert(X->getType()->getScalarType() == Y->getType() &&
237581ad6265SDimitry Andric            "Types don't match!");
237681ad6265SDimitry Andric     if (auto *CX = dyn_cast<ConstantInt>(X))
237781ad6265SDimitry Andric       if (CX->isOne())
237881ad6265SDimitry Andric         return Y;
237981ad6265SDimitry Andric     if (auto *CY = dyn_cast<ConstantInt>(Y))
238081ad6265SDimitry Andric       if (CY->isOne())
238181ad6265SDimitry Andric         return X;
238281ad6265SDimitry Andric     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
238381ad6265SDimitry Andric     if (XVTy && !isa<VectorType>(Y->getType()))
238481ad6265SDimitry Andric       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
238581ad6265SDimitry Andric     return B.CreateMul(X, Y);
238681ad6265SDimitry Andric   };
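  // Illustrative example of the lambdas above (operand values hypothetical):
  // CreateMul(<4 x i64> %idx, i64 4) splats 4 into a <4 x i64> vector before
  // multiplying, whereas CreateMul(%idx, i64 1) folds away and returns %idx.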
238781ad6265SDimitry Andric 
2388c9157d92SDimitry Andric   switch (InductionKind) {
238981ad6265SDimitry Andric   case InductionDescriptor::IK_IntInduction: {
239081ad6265SDimitry Andric     assert(!isa<VectorType>(Index->getType()) &&
239181ad6265SDimitry Andric            "Vector indices not supported for integer inductions yet");
239281ad6265SDimitry Andric     assert(Index->getType() == StartValue->getType() &&
239381ad6265SDimitry Andric            "Index type does not match StartValue type");
239481ad6265SDimitry Andric     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
239581ad6265SDimitry Andric       return B.CreateSub(StartValue, Index);
239681ad6265SDimitry Andric     auto *Offset = CreateMul(Index, Step);
239781ad6265SDimitry Andric     return CreateAdd(StartValue, Offset);
239881ad6265SDimitry Andric   }
2399a58f00eaSDimitry Andric   case InductionDescriptor::IK_PtrInduction:
2400a58f00eaSDimitry Andric     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
240181ad6265SDimitry Andric   case InductionDescriptor::IK_FpInduction: {
240281ad6265SDimitry Andric     assert(!isa<VectorType>(Index->getType()) &&
240381ad6265SDimitry Andric            "Vector indices not supported for FP inductions yet");
240481ad6265SDimitry Andric     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
240581ad6265SDimitry Andric     assert(InductionBinOp &&
240681ad6265SDimitry Andric            (InductionBinOp->getOpcode() == Instruction::FAdd ||
240781ad6265SDimitry Andric             InductionBinOp->getOpcode() == Instruction::FSub) &&
240881ad6265SDimitry Andric            "Original bin op should be defined for FP induction");
240981ad6265SDimitry Andric 
241081ad6265SDimitry Andric     Value *MulExp = B.CreateFMul(Step, Index);
241181ad6265SDimitry Andric     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
241281ad6265SDimitry Andric                          "induction");
241381ad6265SDimitry Andric   }
241481ad6265SDimitry Andric   case InductionDescriptor::IK_NoInduction:
241581ad6265SDimitry Andric     return nullptr;
241681ad6265SDimitry Andric   }
241781ad6265SDimitry Andric   llvm_unreachable("invalid enum");
241881ad6265SDimitry Andric }
241981ad6265SDimitry Andric 
2420fe013be4SDimitry Andric std::optional<unsigned> getMaxVScale(const Function &F,
2421fe013be4SDimitry Andric                                      const TargetTransformInfo &TTI) {
2422fe013be4SDimitry Andric   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2423fe013be4SDimitry Andric     return MaxVScale;
2424fe013be4SDimitry Andric 
2425fe013be4SDimitry Andric   if (F.hasFnAttribute(Attribute::VScaleRange))
2426fe013be4SDimitry Andric     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2427fe013be4SDimitry Andric 
2428fe013be4SDimitry Andric   return std::nullopt;
2429fe013be4SDimitry Andric }
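// Example (a sketch; the numbers are hypothetical): if TTI reports no maximum
// but the function carries vscale_range(1,16), the attribute's upper bound 16
// is returned; with neither source of information the result is std::nullopt.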
2430fe013be4SDimitry Andric 
2431fe013be4SDimitry Andric /// For the given VF and UF and maximum trip count computed for the loop,
2432fe013be4SDimitry Andric /// return true if the induction variable of the vectorized loop is known not
2433fe013be4SDimitry Andric /// to overflow. If so, the runtime overflow check always evaluates to false
2434fe013be4SDimitry Andric /// and can be removed.
2435fe013be4SDimitry Andric static bool isIndvarOverflowCheckKnownFalse(
2436fe013be4SDimitry Andric     const LoopVectorizationCostModel *Cost,
2437fe013be4SDimitry Andric     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2438fe013be4SDimitry Andric   // Always be conservative if we don't know the exact unroll factor.
2439fe013be4SDimitry Andric   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2440fe013be4SDimitry Andric 
2441fe013be4SDimitry Andric   Type *IdxTy = Cost->Legal->getWidestInductionType();
2442fe013be4SDimitry Andric   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2443fe013be4SDimitry Andric 
2444fe013be4SDimitry Andric   // The runtime overflow check is known to be false iff the (max) trip count
2445fe013be4SDimitry Andric   // is known and (max) trip count + (VF * UF) does not overflow in the type
2446fe013be4SDimitry Andric   // of the vector loop induction variable.
2447fe013be4SDimitry Andric   if (unsigned TC =
2448fe013be4SDimitry Andric           Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2449fe013be4SDimitry Andric     uint64_t MaxVF = VF.getKnownMinValue();
2450fe013be4SDimitry Andric     if (VF.isScalable()) {
2451fe013be4SDimitry Andric       std::optional<unsigned> MaxVScale =
2452fe013be4SDimitry Andric           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2453fe013be4SDimitry Andric       if (!MaxVScale)
2454fe013be4SDimitry Andric         return false;
2455fe013be4SDimitry Andric       MaxVF *= *MaxVScale;
2456fe013be4SDimitry Andric     }
2457fe013be4SDimitry Andric 
2458fe013be4SDimitry Andric     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2459fe013be4SDimitry Andric   }
2460fe013be4SDimitry Andric 
2461fe013be4SDimitry Andric   return false;
2462fe013be4SDimitry Andric }
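// Worked example (hypothetical values): for IdxTy = i32 the mask is 2^32 - 1.
// With a known maximum trip count TC = 1000, fixed VF = 4 and UF = 2, we have
// (2^32 - 1 - 1000) ugt (4 * 2), so the overflow check is known false and can
// be elided. Without a known maximum trip count we conservatively return
// false.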
2463fe013be4SDimitry Andric 
24640b57cec5SDimitry Andric // Return whether we allow using masked interleave-groups (for dealing with
24650b57cec5SDimitry Andric // strided loads/stores that reside in predicated blocks, or for dealing
24660b57cec5SDimitry Andric // with gaps).
24670b57cec5SDimitry Andric static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
24680b57cec5SDimitry Andric   // If an override option has been passed in for interleaved accesses, use it.
24690b57cec5SDimitry Andric   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
24700b57cec5SDimitry Andric     return EnableMaskedInterleavedMemAccesses;
24710b57cec5SDimitry Andric 
24720b57cec5SDimitry Andric   return TTI.enableMaskedInterleavedAccessVectorization();
24730b57cec5SDimitry Andric }
24740b57cec5SDimitry Andric 
24750b57cec5SDimitry Andric // Try to vectorize the interleave group that \p Instr belongs to.
24760b57cec5SDimitry Andric //
24770b57cec5SDimitry Andric // E.g. Translate the following interleaved load group (factor = 3):
24780b57cec5SDimitry Andric //   for (i = 0; i < N; i+=3) {
24790b57cec5SDimitry Andric //     R = Pic[i];             // Member of index 0
24800b57cec5SDimitry Andric //     G = Pic[i+1];           // Member of index 1
24810b57cec5SDimitry Andric //     B = Pic[i+2];           // Member of index 2
24820b57cec5SDimitry Andric //     ... // do something to R, G, B
24830b57cec5SDimitry Andric //   }
24840b57cec5SDimitry Andric // To:
24850b57cec5SDimitry Andric //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2486e8d8bef9SDimitry Andric //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2487e8d8bef9SDimitry Andric //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2488e8d8bef9SDimitry Andric //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
24890b57cec5SDimitry Andric //
24900b57cec5SDimitry Andric // Or translate the following interleaved store group (factor = 3):
24910b57cec5SDimitry Andric //   for (i = 0; i < N; i+=3) {
24920b57cec5SDimitry Andric //     ... do something to R, G, B
24930b57cec5SDimitry Andric //     Pic[i]   = R;           // Member of index 0
24940b57cec5SDimitry Andric //     Pic[i+1] = G;           // Member of index 1
24950b57cec5SDimitry Andric //     Pic[i+2] = B;           // Member of index 2
24960b57cec5SDimitry Andric //   }
24970b57cec5SDimitry Andric // To:
24980b57cec5SDimitry Andric //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2499e8d8bef9SDimitry Andric //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
25000b57cec5SDimitry Andric //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
25010b57cec5SDimitry Andric //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
25020b57cec5SDimitry Andric //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
25035ffd83dbSDimitry Andric void InnerLoopVectorizer::vectorizeInterleaveGroup(
2504e8d8bef9SDimitry Andric     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2505e8d8bef9SDimitry Andric     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2506fe013be4SDimitry Andric     VPValue *BlockInMask, bool NeedsMaskForGaps) {
25075ffd83dbSDimitry Andric   Instruction *Instr = Group->getInsertPos();
25080b57cec5SDimitry Andric   const DataLayout &DL = Instr->getModule()->getDataLayout();
25090b57cec5SDimitry Andric 
25100b57cec5SDimitry Andric   // Prepare for the vector type of the interleaved load/store.
2511fe6060f1SDimitry Andric   Type *ScalarTy = getLoadStoreType(Instr);
25120b57cec5SDimitry Andric   unsigned InterleaveFactor = Group->getFactor();
2513e8d8bef9SDimitry Andric   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
25140b57cec5SDimitry Andric 
25150b57cec5SDimitry Andric   // Prepare for the new pointers.
2516480093f4SDimitry Andric   SmallVector<Value *, 2> AddrParts;
25170b57cec5SDimitry Andric   unsigned Index = Group->getIndex(Instr);
25180b57cec5SDimitry Andric 
25190b57cec5SDimitry Andric   // TODO: extend the masked interleaved-group support to reversed access.
2520480093f4SDimitry Andric   assert((!BlockInMask || !Group->isReverse()) &&
2521480093f4SDimitry Andric          "Reversed masked interleave-group not supported.");
25220b57cec5SDimitry Andric 
2523fe013be4SDimitry Andric   Value *Idx;
25240b57cec5SDimitry Andric   // If the group is reverse, adjust the index to refer to the last vector lane
25250b57cec5SDimitry Andric   // instead of the first. We adjust the index from the first vector lane,
25260b57cec5SDimitry Andric   // rather than directly getting the pointer for lane VF - 1, because the
25270b57cec5SDimitry Andric   // pointer operand of the interleaved access is supposed to be uniform. For
25280b57cec5SDimitry Andric   // uniform instructions, we're only required to generate a value for the
25290b57cec5SDimitry Andric   // first vector lane in each unroll iteration.
2530fe013be4SDimitry Andric   if (Group->isReverse()) {
2531fe013be4SDimitry Andric     Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2532fe013be4SDimitry Andric     Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2533fe013be4SDimitry Andric     Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2534fe013be4SDimitry Andric     Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2535fe013be4SDimitry Andric     Idx = Builder.CreateNeg(Idx);
2536fe013be4SDimitry Andric   } else
2537fe013be4SDimitry Andric     Idx = Builder.getInt32(-Index);
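  // Worked example (hypothetical group): for a reversed group with fixed
  // VF = 4, factor = 3 and member index 1, Idx = -((4 - 1) * 3 + 1) = -10,
  // whereas the same non-reversed group simply uses Idx = -1.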
25380b57cec5SDimitry Andric 
25390b57cec5SDimitry Andric   for (unsigned Part = 0; Part < UF; Part++) {
2540fe6060f1SDimitry Andric     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2541c9157d92SDimitry Andric     if (auto *I = dyn_cast<Instruction>(AddrPart))
2542c9157d92SDimitry Andric       State.setDebugLocFrom(I->getDebugLoc());
25430b57cec5SDimitry Andric 
25440b57cec5SDimitry Andric     // Note that the current instruction may be at any member index. We need to
25450b57cec5SDimitry Andric     // adjust the address to that of the member of index 0.
25460b57cec5SDimitry Andric     //
25470b57cec5SDimitry Andric     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
25480b57cec5SDimitry Andric     //       b = A[i];       // Member of index 0
25490b57cec5SDimitry Andric     // The current pointer points to A[i+1]; adjust it to A[i].
25500b57cec5SDimitry Andric     //
25510b57cec5SDimitry Andric     // E.g.  A[i+1] = a;     // Member of index 1
25520b57cec5SDimitry Andric     //       A[i]   = b;     // Member of index 0
25530b57cec5SDimitry Andric     //       A[i+2] = c;     // Member of index 2 (Current instruction)
25540b57cec5SDimitry Andric     // The current pointer points to A[i+2]; adjust it to A[i].
2555480093f4SDimitry Andric 
2556480093f4SDimitry Andric     bool InBounds = false;
2557480093f4SDimitry Andric     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2558480093f4SDimitry Andric       InBounds = gep->isInBounds();
2559fe013be4SDimitry Andric     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2560c9157d92SDimitry Andric     AddrParts.push_back(AddrPart);
25610b57cec5SDimitry Andric   }
25620b57cec5SDimitry Andric 
2563c9157d92SDimitry Andric   State.setDebugLocFrom(Instr->getDebugLoc());
2564e8d8bef9SDimitry Andric   Value *PoisonVec = PoisonValue::get(VecTy);
25650b57cec5SDimitry Andric 
2566fe013be4SDimitry Andric   auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2567fe013be4SDimitry Andric                              unsigned Part, Value *MaskForGaps) -> Value * {
2568fe013be4SDimitry Andric     if (VF.isScalable()) {
2569fe013be4SDimitry Andric       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2570fe013be4SDimitry Andric       assert(InterleaveFactor == 2 &&
2571fe013be4SDimitry Andric              "Unsupported deinterleave factor for scalable vectors");
2572fe013be4SDimitry Andric       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2573fe013be4SDimitry Andric       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2574fe013be4SDimitry Andric       auto *MaskTy =
2575fe013be4SDimitry Andric           VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2576fe013be4SDimitry Andric       return Builder.CreateIntrinsic(
2577fe013be4SDimitry Andric           MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2578fe013be4SDimitry Andric           /*FMFSource=*/nullptr, "interleaved.mask");
25790b57cec5SDimitry Andric     }
25800b57cec5SDimitry Andric 
2581fe013be4SDimitry Andric     if (!BlockInMask)
2582fe013be4SDimitry Andric       return MaskForGaps;
2583fe013be4SDimitry Andric 
2584fe013be4SDimitry Andric     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2585fe013be4SDimitry Andric     Value *ShuffledMask = Builder.CreateShuffleVector(
2586fe013be4SDimitry Andric         BlockInMaskPart,
2587fe013be4SDimitry Andric         createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2588fe013be4SDimitry Andric         "interleaved.mask");
2589fe013be4SDimitry Andric     return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2590fe013be4SDimitry Andric                                              MaskForGaps)
2591fe013be4SDimitry Andric                        : ShuffledMask;
2592fe013be4SDimitry Andric   };
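  // Illustrative sketch of the fixed-width case (VF and factor hypothetical):
  // with VF = 4 and an interleave factor of 3, a block mask <m0,m1,m2,m3> is
  // shuffled to <m0,m0,m0,m1,m1,m1,m2,m2,m2,m3,m3,m3> and, if a gap mask is
  // present, AND'ed with it.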
2593fe013be4SDimitry Andric 
25940b57cec5SDimitry Andric   // Vectorize the interleaved load group.
25950b57cec5SDimitry Andric   if (isa<LoadInst>(Instr)) {
2596fe013be4SDimitry Andric     Value *MaskForGaps = nullptr;
2597fe013be4SDimitry Andric     if (NeedsMaskForGaps) {
2598fe013be4SDimitry Andric       MaskForGaps =
2599fe013be4SDimitry Andric           createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2600fe013be4SDimitry Andric       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2601fe013be4SDimitry Andric     }
2602fe013be4SDimitry Andric 
26030b57cec5SDimitry Andric     // For each unroll part, create a wide load for the group.
26040b57cec5SDimitry Andric     SmallVector<Value *, 2> NewLoads;
26050b57cec5SDimitry Andric     for (unsigned Part = 0; Part < UF; Part++) {
26060b57cec5SDimitry Andric       Instruction *NewLoad;
2607480093f4SDimitry Andric       if (BlockInMask || MaskForGaps) {
26080b57cec5SDimitry Andric         assert(useMaskedInterleavedAccesses(*TTI) &&
26090b57cec5SDimitry Andric                "masked interleaved groups are not allowed.");
2610fe013be4SDimitry Andric         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
26110b57cec5SDimitry Andric         NewLoad =
2612fe6060f1SDimitry Andric             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2613e8d8bef9SDimitry Andric                                      GroupMask, PoisonVec, "wide.masked.vec");
26140b57cec5SDimitry Andric       }
26150b57cec5SDimitry Andric       else
2616480093f4SDimitry Andric         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
26175ffd83dbSDimitry Andric                                             Group->getAlign(), "wide.vec");
26180b57cec5SDimitry Andric       Group->addMetadata(NewLoad);
26190b57cec5SDimitry Andric       NewLoads.push_back(NewLoad);
26200b57cec5SDimitry Andric     }
26210b57cec5SDimitry Andric 
2622fe013be4SDimitry Andric     if (VecTy->isScalableTy()) {
2623fe013be4SDimitry Andric       assert(InterleaveFactor == 2 &&
2624fe013be4SDimitry Andric              "Unsupported deinterleave factor for scalable vectors");
2625fe013be4SDimitry Andric 
2626fe013be4SDimitry Andric       for (unsigned Part = 0; Part < UF; ++Part) {
2627fe013be4SDimitry Andric         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2628fe013be4SDimitry Andric         // so we must use intrinsics to deinterleave.
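        // Illustrative IR sketch (types assumed: factor 2 with
        // <vscale x 4 x i32> members):
        //   %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
        //     @llvm.experimental.vector.deinterleave2.nxv8i32(
        //       <vscale x 8 x i32> %wide.vec)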
2629fe013be4SDimitry Andric         Value *DI = Builder.CreateIntrinsic(
2630fe013be4SDimitry Andric             Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2631fe013be4SDimitry Andric             /*FMFSource=*/nullptr, "strided.vec");
2632fe013be4SDimitry Andric         unsigned J = 0;
2633fe013be4SDimitry Andric         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2634fe013be4SDimitry Andric           Instruction *Member = Group->getMember(I);
2635fe013be4SDimitry Andric 
2636fe013be4SDimitry Andric           if (!Member)
2637fe013be4SDimitry Andric             continue;
2638fe013be4SDimitry Andric 
2639fe013be4SDimitry Andric           Value *StridedVec = Builder.CreateExtractValue(DI, I);
2640fe013be4SDimitry Andric           // If this member has a different type, cast the result to that type.
2641fe013be4SDimitry Andric           if (Member->getType() != ScalarTy) {
2642fe013be4SDimitry Andric             VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2643fe013be4SDimitry Andric             StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2644fe013be4SDimitry Andric           }
2645fe013be4SDimitry Andric 
2646fe013be4SDimitry Andric           if (Group->isReverse())
2647fe013be4SDimitry Andric             StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2648fe013be4SDimitry Andric 
2649fe013be4SDimitry Andric           State.set(VPDefs[J], StridedVec, Part);
2650fe013be4SDimitry Andric           ++J;
2651fe013be4SDimitry Andric         }
2652fe013be4SDimitry Andric       }
2653fe013be4SDimitry Andric 
2654fe013be4SDimitry Andric       return;
2655fe013be4SDimitry Andric     }
2656fe013be4SDimitry Andric 
26570b57cec5SDimitry Andric     // For each member in the group, shuffle out the appropriate data from the
26580b57cec5SDimitry Andric     // wide loads.
2659e8d8bef9SDimitry Andric     unsigned J = 0;
26600b57cec5SDimitry Andric     for (unsigned I = 0; I < InterleaveFactor; ++I) {
26610b57cec5SDimitry Andric       Instruction *Member = Group->getMember(I);
26620b57cec5SDimitry Andric 
26630b57cec5SDimitry Andric       // Skip the gaps in the group.
26640b57cec5SDimitry Andric       if (!Member)
26650b57cec5SDimitry Andric         continue;
26660b57cec5SDimitry Andric 
2667e8d8bef9SDimitry Andric       auto StrideMask =
2668e8d8bef9SDimitry Andric           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
26690b57cec5SDimitry Andric       for (unsigned Part = 0; Part < UF; Part++) {
26700b57cec5SDimitry Andric         Value *StridedVec = Builder.CreateShuffleVector(
2671e8d8bef9SDimitry Andric             NewLoads[Part], StrideMask, "strided.vec");
26720b57cec5SDimitry Andric 
26730b57cec5SDimitry Andric         // If this member has a different type, cast the result to that type.
26740b57cec5SDimitry Andric         if (Member->getType() != ScalarTy) {
2675e8d8bef9SDimitry Andric           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2676e8d8bef9SDimitry Andric           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
26770b57cec5SDimitry Andric           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
26780b57cec5SDimitry Andric         }
26790b57cec5SDimitry Andric 
26800b57cec5SDimitry Andric         if (Group->isReverse())
268104eeddc0SDimitry Andric           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
26820b57cec5SDimitry Andric 
2683fe6060f1SDimitry Andric         State.set(VPDefs[J], StridedVec, Part);
26840b57cec5SDimitry Andric       }
2685e8d8bef9SDimitry Andric       ++J;
26860b57cec5SDimitry Andric     }
26870b57cec5SDimitry Andric     return;
26880b57cec5SDimitry Andric   }
26890b57cec5SDimitry Andric 
26900b57cec5SDimitry Andric   // The sub vector type for the current instruction.
2691e8d8bef9SDimitry Andric   auto *SubVT = VectorType::get(ScalarTy, VF);
26920b57cec5SDimitry Andric 
26930b57cec5SDimitry Andric   // Vectorize the interleaved store group.
2694fe013be4SDimitry Andric   Value *MaskForGaps =
2695fe013be4SDimitry Andric       createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2696349cc55cSDimitry Andric   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2697349cc55cSDimitry Andric          "masked interleaved groups are not allowed.");
2698349cc55cSDimitry Andric   assert((!MaskForGaps || !VF.isScalable()) &&
2699349cc55cSDimitry Andric          "masking gaps for scalable vectors is not yet supported.");
27000b57cec5SDimitry Andric   for (unsigned Part = 0; Part < UF; Part++) {
27010b57cec5SDimitry Andric     // Collect the stored vector from each member.
27020b57cec5SDimitry Andric     SmallVector<Value *, 4> StoredVecs;
2703bdd1243dSDimitry Andric     unsigned StoredIdx = 0;
27040b57cec5SDimitry Andric     for (unsigned i = 0; i < InterleaveFactor; i++) {
2705349cc55cSDimitry Andric       assert((Group->getMember(i) || MaskForGaps) &&
2706349cc55cSDimitry Andric              "Fail to get a member from an interleaved store group");
2707349cc55cSDimitry Andric       Instruction *Member = Group->getMember(i);
2708349cc55cSDimitry Andric 
2709349cc55cSDimitry Andric       // Skip the gaps in the group.
2710349cc55cSDimitry Andric       if (!Member) {
2711349cc55cSDimitry Andric         Value *Poison = PoisonValue::get(SubVT);
2712349cc55cSDimitry Andric         StoredVecs.push_back(Poison);
2713349cc55cSDimitry Andric         continue;
2714349cc55cSDimitry Andric       }
27150b57cec5SDimitry Andric 
2716bdd1243dSDimitry Andric       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2717bdd1243dSDimitry Andric       ++StoredIdx;
2718e8d8bef9SDimitry Andric 
27190b57cec5SDimitry Andric       if (Group->isReverse())
272004eeddc0SDimitry Andric         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
27210b57cec5SDimitry Andric 
27220b57cec5SDimitry Andric       // If this member has a different type, cast it to a unified type.
27240b57cec5SDimitry Andric       if (StoredVec->getType() != SubVT)
27250b57cec5SDimitry Andric         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
27260b57cec5SDimitry Andric 
27270b57cec5SDimitry Andric       StoredVecs.push_back(StoredVec);
27280b57cec5SDimitry Andric     }
27290b57cec5SDimitry Andric 
2730fe013be4SDimitry Andric     // Interleave all the smaller vectors into one wider vector.
2731fe013be4SDimitry Andric     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
27320b57cec5SDimitry Andric     Instruction *NewStoreInstr;
2733349cc55cSDimitry Andric     if (BlockInMask || MaskForGaps) {
2734fe013be4SDimitry Andric       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2735349cc55cSDimitry Andric       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2736349cc55cSDimitry Andric                                                 Group->getAlign(), GroupMask);
2737349cc55cSDimitry Andric     } else
27385ffd83dbSDimitry Andric       NewStoreInstr =
27395ffd83dbSDimitry Andric           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
27400b57cec5SDimitry Andric 
27410b57cec5SDimitry Andric     Group->addMetadata(NewStoreInstr);
27420b57cec5SDimitry Andric   }
27430b57cec5SDimitry Andric }
27440b57cec5SDimitry Andric 
2745bdd1243dSDimitry Andric void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
27464824e7fdSDimitry Andric                                                VPReplicateRecipe *RepRecipe,
27470b57cec5SDimitry Andric                                                const VPIteration &Instance,
27485ffd83dbSDimitry Andric                                                VPTransformState &State) {
27490b57cec5SDimitry Andric   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
27500b57cec5SDimitry Andric 
2751e8d8bef9SDimitry Andric   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2752e8d8bef9SDimitry Andric   // the first lane and part.
2753e8d8bef9SDimitry Andric   if (isa<NoAliasScopeDeclInst>(Instr))
2754fe6060f1SDimitry Andric     if (!Instance.isFirstIteration())
2755e8d8bef9SDimitry Andric       return;
2756e8d8bef9SDimitry Andric 
27570b57cec5SDimitry Andric   // Does this instruction return a value?
27580b57cec5SDimitry Andric   bool IsVoidRetTy = Instr->getType()->isVoidTy();
27590b57cec5SDimitry Andric 
27600b57cec5SDimitry Andric   Instruction *Cloned = Instr->clone();
2761c9157d92SDimitry Andric   if (!IsVoidRetTy) {
27620b57cec5SDimitry Andric     Cloned->setName(Instr->getName() + ".cloned");
2763c9157d92SDimitry Andric #if !defined(NDEBUG)
2764c9157d92SDimitry Andric     // Verify that VPlan type inference results agree with the type of the
2765c9157d92SDimitry Andric     // generated values.
2766c9157d92SDimitry Andric     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2767c9157d92SDimitry Andric            "inferred type and type from generated instructions do not match");
2768c9157d92SDimitry Andric #endif
2769c9157d92SDimitry Andric   }
27700b57cec5SDimitry Andric 
2771fe013be4SDimitry Andric   RepRecipe->setFlags(Cloned);
27724824e7fdSDimitry Andric 
2773c9157d92SDimitry Andric   if (auto DL = Instr->getDebugLoc())
2774c9157d92SDimitry Andric     State.setDebugLocFrom(DL);
277581ad6265SDimitry Andric 
27760b57cec5SDimitry Andric   // Replace the operands of the cloned instructions with their scalar
27770b57cec5SDimitry Andric   // equivalents in the new loop.
2778bdd1243dSDimitry Andric   for (const auto &I : enumerate(RepRecipe->operands())) {
2779e8d8bef9SDimitry Andric     auto InputInstance = Instance;
27800eae32dcSDimitry Andric     VPValue *Operand = I.value();
2781bdd1243dSDimitry Andric     if (vputils::isUniformAfterVectorization(Operand))
2782fe6060f1SDimitry Andric       InputInstance.Lane = VPLane::getFirstLane();
27830eae32dcSDimitry Andric     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
27840b57cec5SDimitry Andric   }
278581ad6265SDimitry Andric   State.addNewMetadata(Cloned, Instr);
27860b57cec5SDimitry Andric 
27870b57cec5SDimitry Andric   // Place the cloned scalar in the new loop.
278881ad6265SDimitry Andric   State.Builder.Insert(Cloned);
27890b57cec5SDimitry Andric 
27904824e7fdSDimitry Andric   State.set(RepRecipe, Cloned, Instance);
27910b57cec5SDimitry Andric 
27920b57cec5SDimitry Andric   // If we just cloned a new assumption, add it to the assumption cache.
2793fe6060f1SDimitry Andric   if (auto *II = dyn_cast<AssumeInst>(Cloned))
27940b57cec5SDimitry Andric     AC->registerAssumption(II);
27950b57cec5SDimitry Andric 
27960b57cec5SDimitry Andric   // End if-block.
2797fe013be4SDimitry Andric   bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
27980b57cec5SDimitry Andric   if (IfPredicateInstr)
27990b57cec5SDimitry Andric     PredicatedInstructions.push_back(Cloned);
28000b57cec5SDimitry Andric }
28010b57cec5SDimitry Andric 
280281ad6265SDimitry Andric Value *
280381ad6265SDimitry Andric InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
28040b57cec5SDimitry Andric   if (VectorTripCount)
28050b57cec5SDimitry Andric     return VectorTripCount;
28060b57cec5SDimitry Andric 
2807fe013be4SDimitry Andric   Value *TC = getTripCount();
280881ad6265SDimitry Andric   IRBuilder<> Builder(InsertBlock->getTerminator());
28090b57cec5SDimitry Andric 
28100b57cec5SDimitry Andric   Type *Ty = TC->getType();
2811e8d8bef9SDimitry Andric   // This is where we can make the step a runtime constant.
2812349cc55cSDimitry Andric   Value *Step = createStepForVF(Builder, Ty, VF, UF);
28130b57cec5SDimitry Andric 
28140b57cec5SDimitry Andric   // If the tail is to be folded by masking, round the number of iterations N
28150b57cec5SDimitry Andric   // up to a multiple of Step instead of rounding down. This is done by first
28160b57cec5SDimitry Andric   // adding Step-1 and then rounding down. Note that it's ok if this addition
28170b57cec5SDimitry Andric   // overflows: the vector induction variable will eventually wrap to zero given
28180b57cec5SDimitry Andric   // that it starts at zero and its Step is a power of two; the loop will then
28190b57cec5SDimitry Andric   // exit, with the last early-exit vector comparison also producing all-true.
282081ad6265SDimitry Andric   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
282181ad6265SDimitry Andric   // is accounted for in emitIterationCountCheck that adds an overflow check.
28220b57cec5SDimitry Andric   if (Cost->foldTailByMasking()) {
2823e8d8bef9SDimitry Andric     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
28240b57cec5SDimitry Andric            "VF*UF must be a power of 2 when folding tail by masking");
282504eeddc0SDimitry Andric     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2826e8d8bef9SDimitry Andric     TC = Builder.CreateAdd(
282704eeddc0SDimitry Andric         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
28280b57cec5SDimitry Andric   }
28290b57cec5SDimitry Andric 
28300b57cec5SDimitry Andric   // Now we need to generate the expression for the part of the loop that the
28310b57cec5SDimitry Andric   // vectorized body will execute. This is equal to N - (N % Step) if scalar
28320b57cec5SDimitry Andric   // iterations are not required for correctness, or N - Step, otherwise. Step
28330b57cec5SDimitry Andric   // is equal to the vectorization factor (number of SIMD elements) times the
28340b57cec5SDimitry Andric   // unroll factor (number of SIMD instructions).
28350b57cec5SDimitry Andric   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
28360b57cec5SDimitry Andric 
2837fe6060f1SDimitry Andric   // There are cases where we *must* run at least one iteration in the remainder
2838fe6060f1SDimitry Andric   // loop.  See the cost model for when this can happen.  If the step evenly
2839fe6060f1SDimitry Andric   // divides the trip count, we set the remainder to be equal to the step. If
2840fe6060f1SDimitry Andric   // the step does not evenly divide the trip count, no adjustment is necessary
2841fe6060f1SDimitry Andric   // since there will already be scalar iterations. Note that the minimum
2842fe6060f1SDimitry Andric   // iterations check ensures that N >= Step.
2843fe013be4SDimitry Andric   if (Cost->requiresScalarEpilogue(VF.isVector())) {
28440b57cec5SDimitry Andric     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
28450b57cec5SDimitry Andric     R = Builder.CreateSelect(IsZero, Step, R);
28460b57cec5SDimitry Andric   }
28470b57cec5SDimitry Andric 
28480b57cec5SDimitry Andric   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
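  // Worked example (hypothetical counts): with TC = 10 and Step = VF * UF = 8,
  // the default path gives R = 10 % 8 = 2 and a vector trip count of 8. When
  // folding the tail by masking, TC is first rounded up to 10 + 7 = 17, so
  // R = 1 and the vector trip count becomes 16, covering all 10 iterations
  // under a mask.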
28490b57cec5SDimitry Andric 
28500b57cec5SDimitry Andric   return VectorTripCount;
28510b57cec5SDimitry Andric }
28520b57cec5SDimitry Andric 
28530b57cec5SDimitry Andric Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
28540b57cec5SDimitry Andric                                                    const DataLayout &DL) {
28550b57cec5SDimitry Andric   // Verify that V is a vector type with the same number of elements as DstVTy.
2856fe013be4SDimitry Andric   auto *DstFVTy = cast<VectorType>(DstVTy);
2857fe013be4SDimitry Andric   auto VF = DstFVTy->getElementCount();
2858fe013be4SDimitry Andric   auto *SrcVecTy = cast<VectorType>(V->getType());
2859fe013be4SDimitry Andric   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
28600b57cec5SDimitry Andric   Type *SrcElemTy = SrcVecTy->getElementType();
2861e8d8bef9SDimitry Andric   Type *DstElemTy = DstFVTy->getElementType();
28620b57cec5SDimitry Andric   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
28630b57cec5SDimitry Andric          "Vector elements must have same size");
28640b57cec5SDimitry Andric 
28650b57cec5SDimitry Andric   // Do a direct cast if element types are castable.
28660b57cec5SDimitry Andric   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2867e8d8bef9SDimitry Andric     return Builder.CreateBitOrPointerCast(V, DstFVTy);
28680b57cec5SDimitry Andric   }
28690b57cec5SDimitry Andric   // V cannot be directly cast to the desired vector type. This may happen
28700b57cec5SDimitry Andric   // when V is a floating point vector but DstVTy is a vector of pointers or
28710b57cec5SDimitry Andric   // vice-versa. Handle this with a two-step bitcast through an intermediate
28720b57cec5SDimitry Andric   // integer type, i.e. Ptr <-> Int <-> Float.
28730b57cec5SDimitry Andric   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
28740b57cec5SDimitry Andric          "Only one type should be a pointer type");
28750b57cec5SDimitry Andric   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
28760b57cec5SDimitry Andric          "Only one type should be a floating point type");
28770b57cec5SDimitry Andric   Type *IntTy =
28780b57cec5SDimitry Andric       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2879fe013be4SDimitry Andric   auto *VecIntTy = VectorType::get(IntTy, VF);
28800b57cec5SDimitry Andric   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2881e8d8bef9SDimitry Andric   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
28820b57cec5SDimitry Andric }
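// Example of the two-step case (assuming 64-bit pointers; the types are
// hypothetical): <2 x double> cannot be bitcast directly to <2 x ptr>, so the
// value is routed through an integer vector: <2 x double> -> <2 x i64> ->
// <2 x ptr>.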
28830b57cec5SDimitry Andric 
288481ad6265SDimitry Andric void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2885fe013be4SDimitry Andric   Value *Count = getTripCount();
2886480093f4SDimitry Andric   // Reuse existing vector loop preheader for TC checks.
2887480093f4SDimitry Andric   // Note that a new preheader block is generated for the vector loop.
2888480093f4SDimitry Andric   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2889480093f4SDimitry Andric   IRBuilder<> Builder(TCCheckBlock->getTerminator());
28900b57cec5SDimitry Andric 
28910b57cec5SDimitry Andric   // Generate code to check if the loop's trip count is less than VF * UF, or
28920b57cec5SDimitry Andric   // equal to it in case a scalar epilogue is required; this implies that the
28930b57cec5SDimitry Andric   // vector trip count is zero. This check also covers the case where adding one
28940b57cec5SDimitry Andric   // to the backedge-taken count overflowed leading to an incorrect trip count
28950b57cec5SDimitry Andric   // of zero. In this case we will also jump to the scalar loop.
2896fe013be4SDimitry Andric   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
28970b57cec5SDimitry Andric                                                        : ICmpInst::ICMP_ULT;
28980b57cec5SDimitry Andric 
28990b57cec5SDimitry Andric   // If tail is to be folded, vector loop takes care of all iterations.
290081ad6265SDimitry Andric   Type *CountTy = Count->getType();
29010b57cec5SDimitry Andric   Value *CheckMinIters = Builder.getFalse();
2902fcaf7f86SDimitry Andric   auto CreateStep = [&]() -> Value * {
2903753f127fSDimitry Andric     // Create step with max(MinProfitableTripCount, UF * VF).
2904fcaf7f86SDimitry Andric     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2905753f127fSDimitry Andric       return createStepForVF(Builder, CountTy, VF, UF);
2906fcaf7f86SDimitry Andric 
2907fcaf7f86SDimitry Andric     Value *MinProfTC =
2908fcaf7f86SDimitry Andric         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2909fcaf7f86SDimitry Andric     if (!VF.isScalable())
2910fcaf7f86SDimitry Andric       return MinProfTC;
2911fcaf7f86SDimitry Andric     return Builder.CreateBinaryIntrinsic(
2912fcaf7f86SDimitry Andric         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2913753f127fSDimitry Andric   };
2914753f127fSDimitry Andric 
2915fe013be4SDimitry Andric   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2916fe013be4SDimitry Andric   if (Style == TailFoldingStyle::None)
2917753f127fSDimitry Andric     CheckMinIters =
2918753f127fSDimitry Andric         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2919fe013be4SDimitry Andric   else if (VF.isScalable() &&
2920fe013be4SDimitry Andric            !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2921fe013be4SDimitry Andric            Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
292281ad6265SDimitry Andric     // vscale is not necessarily a power-of-2, which means we cannot guarantee
292381ad6265SDimitry Andric     // an overflow to zero when updating induction variables and so an
292481ad6265SDimitry Andric     // additional overflow check is required before entering the vector loop.
292581ad6265SDimitry Andric 
292681ad6265SDimitry Andric     // Get the maximum unsigned value for the type.
292781ad6265SDimitry Andric     Value *MaxUIntTripCount =
292881ad6265SDimitry Andric         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
292981ad6265SDimitry Andric     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
293081ad6265SDimitry Andric 
293181ad6265SDimitry Andric     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2932753f127fSDimitry Andric     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2933e8d8bef9SDimitry Andric   }
2934753f127fSDimitry Andric 
2935480093f4SDimitry Andric   // Create new preheader for vector loop.
2936480093f4SDimitry Andric   LoopVectorPreHeader =
2937480093f4SDimitry Andric       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2938480093f4SDimitry Andric                  "vector.ph");
2939480093f4SDimitry Andric 
2940480093f4SDimitry Andric   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2941480093f4SDimitry Andric                                DT->getNode(Bypass)->getIDom()) &&
2942480093f4SDimitry Andric          "TC check is expected to dominate Bypass");
2943480093f4SDimitry Andric 
2944fe6060f1SDimitry Andric   // Update dominator for Bypass & LoopExit (if needed).
2945480093f4SDimitry Andric   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2946fe013be4SDimitry Andric   if (!Cost->requiresScalarEpilogue(VF.isVector()))
2947fe6060f1SDimitry Andric     // If there is an epilogue which must run, there's no edge from the
2948fe6060f1SDimitry Andric     // middle block to the exit blocks and thus no need to update the immediate
2949fe6060f1SDimitry Andric     // dominator of the exit blocks.
2950480093f4SDimitry Andric     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2951480093f4SDimitry Andric 
2952c9157d92SDimitry Andric   BranchInst &BI =
2953c9157d92SDimitry Andric       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2954c9157d92SDimitry Andric   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2955c9157d92SDimitry Andric     setBranchWeights(BI, MinItersBypassWeights);
2956c9157d92SDimitry Andric   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2957480093f4SDimitry Andric   LoopBypassBlocks.push_back(TCCheckBlock);
29580b57cec5SDimitry Andric }
29590b57cec5SDimitry Andric 
296081ad6265SDimitry Andric BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2961fe6060f1SDimitry Andric   BasicBlock *const SCEVCheckBlock =
296281ad6265SDimitry Andric       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2963fe6060f1SDimitry Andric   if (!SCEVCheckBlock)
2964fe6060f1SDimitry Andric     return nullptr;
29650b57cec5SDimitry Andric 
2966e8d8bef9SDimitry Andric   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2967e8d8bef9SDimitry Andric            (OptForSizeBasedOnProfile &&
2968e8d8bef9SDimitry Andric             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
29698bcb0991SDimitry Andric          "Cannot SCEV check stride or overflow when optimizing for size");
29708bcb0991SDimitry Andric 
2972480093f4SDimitry Andric   // Update dominator only if this is the first RT check.
2973480093f4SDimitry Andric   if (LoopBypassBlocks.empty()) {
2974480093f4SDimitry Andric     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2975fe013be4SDimitry Andric     if (!Cost->requiresScalarEpilogue(VF.isVector()))
2976fe6060f1SDimitry Andric       // If there is an epilogue which must run, there's no edge from the
2977fe6060f1SDimitry Andric       // middle block to the exit blocks and thus no need to update the immediate
2978fe6060f1SDimitry Andric       // dominator of the exit blocks.
2979480093f4SDimitry Andric       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2980480093f4SDimitry Andric   }
2981480093f4SDimitry Andric 
2982480093f4SDimitry Andric   LoopBypassBlocks.push_back(SCEVCheckBlock);
29830b57cec5SDimitry Andric   AddedSafetyChecks = true;
2984fe6060f1SDimitry Andric   return SCEVCheckBlock;
29850b57cec5SDimitry Andric }
29860b57cec5SDimitry Andric 
298781ad6265SDimitry Andric BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
29880b57cec5SDimitry Andric   // VPlan-native path does not do any analysis for runtime checks currently.
29890b57cec5SDimitry Andric   if (EnableVPlanNativePath)
2990fe6060f1SDimitry Andric     return nullptr;
29910b57cec5SDimitry Andric 
2992fe6060f1SDimitry Andric   BasicBlock *const MemCheckBlock =
299381ad6265SDimitry Andric       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
29940b57cec5SDimitry Andric 
2995fe6060f1SDimitry Andric   // Check if we generated code that checks at runtime whether arrays overlap.
2996fe6060f1SDimitry Andric   // We put the checks into a separate block to make the more common case of
2997fe6060f1SDimitry Andric   // few elements faster.
2998fe6060f1SDimitry Andric   if (!MemCheckBlock)
2999fe6060f1SDimitry Andric     return nullptr;
30000b57cec5SDimitry Andric 
3001e8d8bef9SDimitry Andric   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
30028bcb0991SDimitry Andric     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
30038bcb0991SDimitry Andric            "Cannot emit memory checks when optimizing for size, unless forced "
30048bcb0991SDimitry Andric            "to vectorize.");
30058bcb0991SDimitry Andric     ORE->emit([&]() {
30068bcb0991SDimitry Andric       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
300781ad6265SDimitry Andric                                         OrigLoop->getStartLoc(),
300881ad6265SDimitry Andric                                         OrigLoop->getHeader())
30098bcb0991SDimitry Andric              << "Code-size may be reduced by not forcing "
30108bcb0991SDimitry Andric                 "vectorization, or by source-code modifications "
30118bcb0991SDimitry Andric                 "eliminating the need for runtime checks "
30128bcb0991SDimitry Andric                 "(e.g., adding 'restrict').";
30138bcb0991SDimitry Andric     });
30148bcb0991SDimitry Andric   }
30158bcb0991SDimitry Andric 
3016e8d8bef9SDimitry Andric   LoopBypassBlocks.push_back(MemCheckBlock);
3017fe6060f1SDimitry Andric 
3018e8d8bef9SDimitry Andric   AddedSafetyChecks = true;
3019e8d8bef9SDimitry Andric 
3020fe6060f1SDimitry Andric   return MemCheckBlock;
30210b57cec5SDimitry Andric }
30220b57cec5SDimitry Andric 
302381ad6265SDimitry Andric void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3024e8d8bef9SDimitry Andric   LoopScalarBody = OrigLoop->getHeader();
3025e8d8bef9SDimitry Andric   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3026e8d8bef9SDimitry Andric   assert(LoopVectorPreHeader && "Invalid loop structure");
3027fe6060f1SDimitry Andric   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3028fe013be4SDimitry Andric   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
3029fe6060f1SDimitry Andric          "multiple exit loop without required epilogue?");
3030e8d8bef9SDimitry Andric 
3031e8d8bef9SDimitry Andric   LoopMiddleBlock =
3032e8d8bef9SDimitry Andric       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3033e8d8bef9SDimitry Andric                  LI, nullptr, Twine(Prefix) + "middle.block");
3034e8d8bef9SDimitry Andric   LoopScalarPreHeader =
3035e8d8bef9SDimitry Andric       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3036e8d8bef9SDimitry Andric                  nullptr, Twine(Prefix) + "scalar.ph");
3037e8d8bef9SDimitry Andric 
3038e8d8bef9SDimitry Andric   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3039fe6060f1SDimitry Andric 
3040fe6060f1SDimitry Andric   // Set up the middle block terminator.  Two cases:
3041fe6060f1SDimitry Andric   // 1) If we know that we must execute the scalar epilogue, emit an
3042fe6060f1SDimitry Andric   //    unconditional branch.
3043fe6060f1SDimitry Andric   // 2) Otherwise, we must have a single unique exit block (due to how we
3044bdd1243dSDimitry Andric   //    implement the multiple exit case).  In this case, set up a conditional
3045fe6060f1SDimitry Andric   //    branch from the middle block to the loop scalar preheader, and the
3046fe6060f1SDimitry Andric   //    exit block.  completeLoopSkeleton will update the condition to use an
3047fe6060f1SDimitry Andric   //    iteration check, if required to decide whether to execute the remainder.
3048fe013be4SDimitry Andric   BranchInst *BrInst =
3049fe013be4SDimitry Andric       Cost->requiresScalarEpilogue(VF.isVector())
3050fe013be4SDimitry Andric           ? BranchInst::Create(LoopScalarPreHeader)
3051fe013be4SDimitry Andric           : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3052fe6060f1SDimitry Andric                                Builder.getTrue());
3053e8d8bef9SDimitry Andric   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3054e8d8bef9SDimitry Andric   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3055e8d8bef9SDimitry Andric 
305681ad6265SDimitry Andric   // Update dominator for loop exit. During skeleton creation, only the vector
305781ad6265SDimitry Andric   // pre-header and the middle block are created. The vector loop is entirely
305881ad6265SDimitry Andric   // created during VPlan execution.
3059fe013be4SDimitry Andric   if (!Cost->requiresScalarEpilogue(VF.isVector()))
3060fe6060f1SDimitry Andric     // If there is an epilogue which must run, there's no edge from the
3061fe6060f1SDimitry Andric     // middle block to the exit blocks and thus no need to update the immediate
3062fe6060f1SDimitry Andric     // dominator of the exit blocks.
3063e8d8bef9SDimitry Andric     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3064e8d8bef9SDimitry Andric }
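// After this runs, the partial skeleton looks roughly as follows (a sketch;
// the vector loop itself is only created later, during VPlan execution,
// between vector.ph and middle.block):
//
//    vector.ph --> [ vector loop (from VPlan) ] --> middle.block
//                                                    |        |
//                                                    v        v
//                                               scalar.ph   exit block
//                                                    |
//                                              (scalar loop)
//
// The edge from middle.block to the exit block exists only when no scalar
// epilogue is required.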
3065e8d8bef9SDimitry Andric 
3066bdd1243dSDimitry Andric PHINode *InnerLoopVectorizer::createInductionResumeValue(
3067fe013be4SDimitry Andric     PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3068bdd1243dSDimitry Andric     ArrayRef<BasicBlock *> BypassBlocks,
306981ad6265SDimitry Andric     std::pair<BasicBlock *, Value *> AdditionalBypass) {
307081ad6265SDimitry Andric   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
307181ad6265SDimitry Andric   assert(VectorTripCount && "Expected valid arguments");
3072e8d8bef9SDimitry Andric 
3073bdd1243dSDimitry Andric   Instruction *OldInduction = Legal->getPrimaryInduction();
3074e8d8bef9SDimitry Andric   Value *&EndValue = IVEndValues[OrigPhi];
3075e8d8bef9SDimitry Andric   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3076e8d8bef9SDimitry Andric   if (OrigPhi == OldInduction) {
3077e8d8bef9SDimitry Andric     // We know what the end value is.
3078e8d8bef9SDimitry Andric     EndValue = VectorTripCount;
3079e8d8bef9SDimitry Andric   } else {
308081ad6265SDimitry Andric     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3081fe6060f1SDimitry Andric 
3082fe6060f1SDimitry Andric     // Fast-math-flags propagate from the original induction instruction.
3083fe6060f1SDimitry Andric     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3084fe6060f1SDimitry Andric       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3085fe6060f1SDimitry Andric 
3086c9157d92SDimitry Andric     EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3087c9157d92SDimitry Andric                                     Step, II.getKind(), II.getInductionBinOp());
3088e8d8bef9SDimitry Andric     EndValue->setName("ind.end");
3089e8d8bef9SDimitry Andric 
3090e8d8bef9SDimitry Andric     // Compute the end value for the additional bypass (if applicable).
3091e8d8bef9SDimitry Andric     if (AdditionalBypass.first) {
3092c9157d92SDimitry Andric       B.SetInsertPoint(AdditionalBypass.first,
3093c9157d92SDimitry Andric                        AdditionalBypass.first->getFirstInsertionPt());
3094c9157d92SDimitry Andric       EndValueFromAdditionalBypass =
3095c9157d92SDimitry Andric           emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3096c9157d92SDimitry Andric                                Step, II.getKind(), II.getInductionBinOp());
3097e8d8bef9SDimitry Andric       EndValueFromAdditionalBypass->setName("ind.end");
3098e8d8bef9SDimitry Andric     }
3099e8d8bef9SDimitry Andric   }
310081ad6265SDimitry Andric 
310181ad6265SDimitry Andric   // Create phi nodes to merge from the backedge-taken check block.
3102bdd1243dSDimitry Andric   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
310381ad6265SDimitry Andric                                          LoopScalarPreHeader->getTerminator());
310481ad6265SDimitry Andric   // Copy original phi DL over to the new one.
310581ad6265SDimitry Andric   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
310681ad6265SDimitry Andric 
3107e8d8bef9SDimitry Andric   // The new PHI merges the original incoming value, in case of a bypass,
3108e8d8bef9SDimitry Andric   // or the value at the end of the vectorized loop.
3109e8d8bef9SDimitry Andric   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3110e8d8bef9SDimitry Andric 
3111e8d8bef9SDimitry Andric   // Fix the scalar body counter (PHI node).
3112e8d8bef9SDimitry Andric   // The old induction's phi node in the scalar body needs the truncated
3113e8d8bef9SDimitry Andric   // value.
3114bdd1243dSDimitry Andric   for (BasicBlock *BB : BypassBlocks)
3115e8d8bef9SDimitry Andric     BCResumeVal->addIncoming(II.getStartValue(), BB);
3116e8d8bef9SDimitry Andric 
3117e8d8bef9SDimitry Andric   if (AdditionalBypass.first)
3118e8d8bef9SDimitry Andric     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3119e8d8bef9SDimitry Andric                                           EndValueFromAdditionalBypass);
3120bdd1243dSDimitry Andric   return BCResumeVal;
3121bdd1243dSDimitry Andric }
3122e8d8bef9SDimitry Andric 
3123fe013be4SDimitry Andric /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3124fe013be4SDimitry Andric /// expansion results.
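/// Illustrative behavior: a constant step such as '4' is returned directly as
/// a ConstantInt; a loop-invariant IR value wrapped in a SCEVUnknown is
/// returned as-is; any composite step expression must already have been
/// materialized by SCEV expansion and is looked up in \p ExpandedSCEVs.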
static Value *getExpandedStep(const InductionDescriptor &ID,
                              const SCEV2ValueTy &ExpandedSCEVs) {
  const SCEV *Step = ID.getStep();
  if (auto *C = dyn_cast<SCEVConstant>(Step))
    return C->getValue();
  if (auto *U = dyn_cast<SCEVUnknown>(Step))
    return U->getValue();
  auto I = ExpandedSCEVs.find(Step);
  assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
  return I->second;
}

void InnerLoopVectorizer::createInductionResumeValues(
    const SCEV2ValueTy &ExpandedSCEVs,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (const auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    const InductionDescriptor &II = InductionEntry.second;
    PHINode *BCResumeVal = createInductionResumeValue(
        OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
        AdditionalBypass);
    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}

BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
  // The trip counts should be cached by now.
  Value *Count = getTripCount();
  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.  Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader.  Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
      !Cost->foldTailByMasking()) {
    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has got a line number inside the loop.
    // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
    // operands. Perform simplification directly on VPlan once the branch is
    // modeled there.
    IRBuilder<> B(LoopMiddleBlock->getTerminator());
    B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
    Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
    BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
    BI.setCondition(CmpN);
    if (hasBranchWeightMD(*ScalarLatchTerm)) {
      // Assume that `Count % VectorTripCount` is equally distributed.
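      // Worked example (illustrative): with UF = 2 and a fixed VF = 4,
      // TripCount below is 8, and the weights {1, 7} encode that the
      // remainder is zero in 1 of 8 equally likely cases.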
      unsigned TripCount = UF * VF.getKnownMinValue();
      assert(TripCount > 0 && "trip count should not be zero");
      const uint32_t Weights[] = {1, TripCount - 1};
      setBranchWeights(BI, Weights);
    }
  }

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

  return LoopVectorPreHeader;
}

std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
     /  |      preheader are expanded here. Eventually all required SCEV
    /   |      expansion should happen here.
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop (created during VPlan execution).
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
   | ->[ ]     <--- new preheader.
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
    \   |
     \  v
      >[ ]     <-- exit block(s).
   ...
   */

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero, skip the vector loop
  // and jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitIterationCountCheck(LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(LoopScalarPreHeader);

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(ExpandedSCEVs);

  return {completeLoopSkeleton(), nullptr};
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *VectorHeader, VPlan &Plan,
                                       VPTransformState &State) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
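  // Illustrative: for an IV starting at 0 with step 1, CRD is the vector trip
  // count N, and the escape value below computes 0 + 1 * (N - 1) = N - 1.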
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
      CountMinusOne->setName("cmo");

      VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
      assert(StepVPV && "step must have been expanded during VPlan execution");
      Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
                                        : State.get(StepVPV, {0, 0});
      Value *Escape =
          emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
                               II.getKind(), II.getInductionBinOp());
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
      PHI->addIncoming(I.second, MiddleBlock);
      Plan.removeLiveOut(PHI);
    }
  }
}

namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
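/// Illustrative example: two identical 'extractelement <4 x i32> %v, i64 3'
/// instructions in \p BB are collapsed into one, with all uses of the second
/// redirected to the first.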
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (Instruction &In : llvm::make_early_inc_range(*BB)) {
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(&In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    CSEMap[&In] = &In;
  }
}

InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *RetTy = CI->getType();
  if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
      return *RedCost;

  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(ScalarCallCost, IntrinsicCost);
  }
  return ScalarCallCost;
}

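// Illustrative behavior of the helper below: with VF = 4, an i32 element type
// becomes <4 x i32> and a float becomes <4 x float>; scalar VFs, and types
// that are not integer, pointer, or floating point, are returned unchanged.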
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
    return Elt;
  return VectorType::get(Elt, VF);
}

InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  SmallVector<Type *> ParamTys;
  std::transform(FTy->param_begin(), FTy->param_end(),
                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  return TTI.getIntrinsicInstrCost(CostAttrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}

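// Illustrative: given T1 = <4 x i8> and T2 = <4 x i32>, the two helpers below
// return <4 x i8> and <4 x i32>, respectively, comparing element bit widths
// only.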
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                            VPlan &Plan) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (EnableVPlanNativePath)
    fixNonInductionPHIs(Plan, State);

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences. Note that the fixing
  // of reduction phis is already modeled in VPlan.
  // TODO: Also model fixing fixed-order recurrence phis in VPlan.
  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
  for (VPRecipeBase &R : HeaderVPBB->phis()) {
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFixedOrderRecurrence(FOR, State);
  }

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);
  PSE.getSE()->forgetBlockAndLoopDispositions();

  // After vectorization, the exit blocks of the original loop will have
  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
  // looked through single-entry phis.
  SmallVector<BasicBlock *> ExitBlocks;
  OrigLoop->getExitBlocks(ExitBlocks);
  for (BasicBlock *Exit : ExitBlocks)
    for (PHINode &PN : Exit->phis())
      PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);

  VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
  Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
  if (Cost->requiresScalarEpilogue(VF.isVector())) {
    // No edge from the middle block to the unique exit block has been inserted
    // and there is nothing to fix from the vector loop; phis should have
    // incoming values from the scalar loop only.
  } else {
    // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
    // the cost model.

    // If we inserted an edge from the middle block to the unique exit block,
    // update uses outside the loop (phis) to account for the newly inserted
    // edge.

    // Fix-up external users of the induction variables.
    for (const auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
                   IVEndValues[Entry.first], LoopMiddleBlock,
                   VectorLoop->getHeader(), Plan, State);
  }

  // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
  // in the exit block, so update the builder.
  State.Builder.SetInsertPoint(State.CFG.ExitBB,
                               State.CFG.ExitBB->getFirstNonPHIIt());
  for (const auto &KV : Plan.getLiveOuts())
    KV.second->fixPhi(Plan, State);

  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(VectorLoop->getHeader());

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
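  // Illustrative (assuming setProfileInfoAfterUnrolling scales the weights by
  // the given factor VF * UF): with VF = 4 and UF = 2, an original estimated
  // trip count of 100 yields roughly 12 vector iterations, with the remaining
  // ~4 iterations attributed to the scalar remainder loop.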
  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
                               LI->getLoopFor(LoopScalarBody),
                               VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixFixedOrderRecurrence(
    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  Value *RuntimeVF = nullptr;
  if (VF.isVector()) {
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar =
        Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
  }

  auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
  assert(PhiR->getNumUsers() == 1 &&
         RecurSplice->getOpcode() ==
             VPInstruction::FirstOrderRecurrenceSplice &&
         "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
  SmallVector<VPLiveOut *> LiveOuts;
  for (VPUser *U : RecurSplice->users())
    if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
      LiveOuts.push_back(LiveOut);

  if (!LiveOuts.empty()) {
    // Extract the second-to-last element in the middle block if the
    // Phi is used outside the loop. We need to extract the phi itself
    // and not the last element (the phi update in the current iteration). This
    // will be the value when jumping to the exit block from the
    // LoopMiddleBlock, when the scalar loop is not run at all.
    Value *ExtractForPhiUsedOutsideLoop = nullptr;
    if (VF.isVector()) {
      auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
      ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
          Incoming, Idx, "vector.recur.extract.for.phi");
    } else {
      assert(UF > 1 && "VF and UF cannot both be 1");
      // When the loop is unrolled without vectorizing, initialize
      // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
      // `Incoming`. This is analogous to the vectorized case above: extracting
      // the second-to-last element when VF > 1.
      ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
    }

    for (VPLiveOut *LiveOut : LiveOuts) {
      assert(!Cost->requiresScalarEpilogue(VF.isVector()));
      PHINode *LCSSAPhi = LiveOut->getPhi();
      LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
      State.Plan->removeLiveOut(LCSSAPhi);
    }
  }

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know whether we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
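  // Illustrative: if a scalarized 'add' feeding the predicated instruction
  // still sits outside PredBB, it is sunk once all of its uses are inside
  // PredBB; its operands then become new sinking candidates on the next pass.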
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking a load past
      // non-store instructions.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
                                              VPTransformState &State) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
        VPValue *Inc = VPPhi->getIncomingValue(i);
        VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
        NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
      }
    }
  }
}

bool InnerLoopVectorizer::useOrderedReductions(
    const RecurrenceDescriptor &RdxDesc) {
  return Cost->useOrderedReductions(RdxDesc);
}

void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
    return WideningDecision != CM_GatherScatter;
  };
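  // Illustrative: for 'store i32 %v, ptr %p', the use of %p is scalar unless
  // the store was widened into a scatter, while the use of %v is scalar only
  // if the store itself is scalarized (CM_Scalarize).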
38040b57cec5SDimitry Andric 
38050b57cec5SDimitry Andric   // A helper that returns true if the given value is a bitcast or
38060b57cec5SDimitry Andric   // getelementptr instruction contained in the loop.
38070b57cec5SDimitry Andric   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
38080b57cec5SDimitry Andric     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
38090b57cec5SDimitry Andric             isa<GetElementPtrInst>(V)) &&
38100b57cec5SDimitry Andric            !TheLoop->isLoopInvariant(V);
38110b57cec5SDimitry Andric   };
38120b57cec5SDimitry Andric 
38134824e7fdSDimitry Andric   // A helper that evaluates a memory access's use of a pointer. If the use will
38144824e7fdSDimitry Andric   // be a scalar use and the pointer is only used by memory accesses, we place
38154824e7fdSDimitry Andric   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
38164824e7fdSDimitry Andric   // PossibleNonScalarPtrs.
38170b57cec5SDimitry Andric   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
38180b57cec5SDimitry Andric     // We only care about bitcast and getelementptr instructions contained in
38190b57cec5SDimitry Andric     // the loop.
38200b57cec5SDimitry Andric     if (!isLoopVaryingBitCastOrGEP(Ptr))
38210b57cec5SDimitry Andric       return;
38220b57cec5SDimitry Andric 
38230b57cec5SDimitry Andric     // If the pointer has already been identified as scalar (e.g., if it was
38240b57cec5SDimitry Andric     // also identified as uniform), there's nothing to do.
38250b57cec5SDimitry Andric     auto *I = cast<Instruction>(Ptr);
38260b57cec5SDimitry Andric     if (Worklist.count(I))
38270b57cec5SDimitry Andric       return;
38280b57cec5SDimitry Andric 
3829349cc55cSDimitry Andric     // If the use of the pointer will be a scalar use, and all users of the
3830349cc55cSDimitry Andric     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3831349cc55cSDimitry Andric     // place the pointer in PossibleNonScalarPtrs.
3832349cc55cSDimitry Andric     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3833349cc55cSDimitry Andric           return isa<LoadInst>(U) || isa<StoreInst>(U);
38340b57cec5SDimitry Andric         }))
38350b57cec5SDimitry Andric       ScalarPtrs.insert(I);
38360b57cec5SDimitry Andric     else
38370b57cec5SDimitry Andric       PossibleNonScalarPtrs.insert(I);
38380b57cec5SDimitry Andric   };
38390b57cec5SDimitry Andric 
38400b57cec5SDimitry Andric   // We seed the scalars analysis with three classes of instructions: (1)
3841e8d8bef9SDimitry Andric   // instructions marked uniform-after-vectorization and (2) bitcast,
3842e8d8bef9SDimitry Andric   // getelementptr and (pointer) phi instructions used by memory accesses
3843e8d8bef9SDimitry Andric   // requiring a scalar use.
38440b57cec5SDimitry Andric   //
38450b57cec5SDimitry Andric   // (1) Add to the worklist all instructions that have been identified as
38460b57cec5SDimitry Andric   // uniform-after-vectorization.
38470b57cec5SDimitry Andric   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
38480b57cec5SDimitry Andric 
38490b57cec5SDimitry Andric   // (2) Add to the worklist all bitcast and getelementptr instructions used by
38500b57cec5SDimitry Andric   // memory accesses requiring a scalar use. The pointer operands of loads and
38510b57cec5SDimitry Andric   // stores will be scalar as long as the memory accesses is not a gather or
38520b57cec5SDimitry Andric   // scatter operation. The value operand of a store will remain scalar if the
38530b57cec5SDimitry Andric   // store is scalarized.
38540b57cec5SDimitry Andric   for (auto *BB : TheLoop->blocks())
38550b57cec5SDimitry Andric     for (auto &I : *BB) {
38560b57cec5SDimitry Andric       if (auto *Load = dyn_cast<LoadInst>(&I)) {
38570b57cec5SDimitry Andric         evaluatePtrUse(Load, Load->getPointerOperand());
38580b57cec5SDimitry Andric       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
38590b57cec5SDimitry Andric         evaluatePtrUse(Store, Store->getPointerOperand());
38600b57cec5SDimitry Andric         evaluatePtrUse(Store, Store->getValueOperand());
38610b57cec5SDimitry Andric       }
38620b57cec5SDimitry Andric     }
38630b57cec5SDimitry Andric   for (auto *I : ScalarPtrs)
38645ffd83dbSDimitry Andric     if (!PossibleNonScalarPtrs.count(I)) {
38650b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
38660b57cec5SDimitry Andric       Worklist.insert(I);
38670b57cec5SDimitry Andric     }
38680b57cec5SDimitry Andric 
38690b57cec5SDimitry Andric   // Insert the forced scalars.
387081ad6265SDimitry Andric   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
38710b57cec5SDimitry Andric   // induction variable when the PHI user is scalarized.
38720b57cec5SDimitry Andric   auto ForcedScalar = ForcedScalars.find(VF);
38730b57cec5SDimitry Andric   if (ForcedScalar != ForcedScalars.end())
3874bdd1243dSDimitry Andric     for (auto *I : ForcedScalar->second) {
3875bdd1243dSDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
38760b57cec5SDimitry Andric       Worklist.insert(I);
3877bdd1243dSDimitry Andric     }
38780b57cec5SDimitry Andric 
38790b57cec5SDimitry Andric   // Expand the worklist by looking through any bitcasts and getelementptr
38800b57cec5SDimitry Andric   // instructions we've already identified as scalar. This is similar to the
38810b57cec5SDimitry Andric   // expansion step in collectLoopUniforms(); however, here we're only
38820b57cec5SDimitry Andric   // expanding to include additional bitcasts and getelementptr instructions.
38830b57cec5SDimitry Andric   unsigned Idx = 0;
38840b57cec5SDimitry Andric   while (Idx != Worklist.size()) {
38850b57cec5SDimitry Andric     Instruction *Dst = Worklist[Idx++];
38860b57cec5SDimitry Andric     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
38870b57cec5SDimitry Andric       continue;
38880b57cec5SDimitry Andric     auto *Src = cast<Instruction>(Dst->getOperand(0));
38890b57cec5SDimitry Andric     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
38900b57cec5SDimitry Andric           auto *J = cast<Instruction>(U);
38910b57cec5SDimitry Andric           return !TheLoop->contains(J) || Worklist.count(J) ||
38920b57cec5SDimitry Andric                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
38930b57cec5SDimitry Andric                   isScalarUse(J, Src));
38940b57cec5SDimitry Andric         })) {
38950b57cec5SDimitry Andric       Worklist.insert(Src);
38960b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
38970b57cec5SDimitry Andric     }
38980b57cec5SDimitry Andric   }
38990b57cec5SDimitry Andric 
39000b57cec5SDimitry Andric   // An induction variable will remain scalar if all users of the induction
39010b57cec5SDimitry Andric   // variable and induction variable update remain scalar.
3902bdd1243dSDimitry Andric   for (const auto &Induction : Legal->getInductionVars()) {
39030b57cec5SDimitry Andric     auto *Ind = Induction.first;
39040b57cec5SDimitry Andric     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
39050b57cec5SDimitry Andric 
39065ffd83dbSDimitry Andric     // If tail-folding is applied, the primary induction variable will be used
39075ffd83dbSDimitry Andric     // to feed a vector compare.
39085ffd83dbSDimitry Andric     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
39095ffd83dbSDimitry Andric       continue;
39105ffd83dbSDimitry Andric 
39114824e7fdSDimitry Andric     // Returns true if \p Indvar is a pointer induction that is used directly by
39124824e7fdSDimitry Andric     // load/store instruction \p I.
39134824e7fdSDimitry Andric     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
39144824e7fdSDimitry Andric                                               Instruction *I) {
39154824e7fdSDimitry Andric       return Induction.second.getKind() ==
39164824e7fdSDimitry Andric                  InductionDescriptor::IK_PtrInduction &&
39174824e7fdSDimitry Andric              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
39184824e7fdSDimitry Andric              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
39194824e7fdSDimitry Andric     };
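    // Illustrative (hypothetical) IR for the case tested above: a pointer
    // induction whose value is directly the address of a scalar-use access,
    //   %p = phi ptr [ %start, %preheader ], [ %p.next, %latch ]
    //   %v = load i32, ptr %p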
39204824e7fdSDimitry Andric 
39210b57cec5SDimitry Andric     // Determine if all users of the induction variable are scalar after
39220b57cec5SDimitry Andric     // vectorization.
39230b57cec5SDimitry Andric     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
39240b57cec5SDimitry Andric       auto *I = cast<Instruction>(U);
39254824e7fdSDimitry Andric       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
39264824e7fdSDimitry Andric              IsDirectLoadStoreFromPtrIndvar(Ind, I);
39270b57cec5SDimitry Andric     });
39280b57cec5SDimitry Andric     if (!ScalarInd)
39290b57cec5SDimitry Andric       continue;
39300b57cec5SDimitry Andric 
39310b57cec5SDimitry Andric     // Determine if all users of the induction variable update instruction are
39320b57cec5SDimitry Andric     // scalar after vectorization.
39330b57cec5SDimitry Andric     auto ScalarIndUpdate =
39340b57cec5SDimitry Andric         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
39350b57cec5SDimitry Andric           auto *I = cast<Instruction>(U);
39364824e7fdSDimitry Andric           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
39374824e7fdSDimitry Andric                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
39380b57cec5SDimitry Andric         });
39390b57cec5SDimitry Andric     if (!ScalarIndUpdate)
39400b57cec5SDimitry Andric       continue;
39410b57cec5SDimitry Andric 
39420b57cec5SDimitry Andric     // The induction variable and its update instruction will remain scalar.
39430b57cec5SDimitry Andric     Worklist.insert(Ind);
39440b57cec5SDimitry Andric     Worklist.insert(IndUpdate);
39450b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
39460b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
39470b57cec5SDimitry Andric                       << "\n");
39480b57cec5SDimitry Andric   }
39490b57cec5SDimitry Andric 
39500b57cec5SDimitry Andric   Scalars[VF].insert(Worklist.begin(), Worklist.end());
39510b57cec5SDimitry Andric }
39520b57cec5SDimitry Andric 
395304eeddc0SDimitry Andric bool LoopVectorizationCostModel::isScalarWithPredication(
395404eeddc0SDimitry Andric     Instruction *I, ElementCount VF) const {
3955bdd1243dSDimitry Andric   if (!isPredicatedInst(I))
39560b57cec5SDimitry Andric     return false;
3957bdd1243dSDimitry Andric 
3958bdd1243dSDimitry Andric   // Do we have a non-scalar lowering for this predicated
3959bdd1243dSDimitry Andric   // instruction? If not, it is scalar with predication.
39600b57cec5SDimitry Andric   switch (I->getOpcode()) {
39610b57cec5SDimitry Andric   default:
3962bdd1243dSDimitry Andric     return true;
3963fe013be4SDimitry Andric   case Instruction::Call:
3964c9157d92SDimitry Andric     if (VF.isScalar())
3965c9157d92SDimitry Andric       return true;
3966c9157d92SDimitry Andric     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3967c9157d92SDimitry Andric                .Kind == CM_Scalarize;
39680b57cec5SDimitry Andric   case Instruction::Load:
39690b57cec5SDimitry Andric   case Instruction::Store: {
39700b57cec5SDimitry Andric     auto *Ptr = getLoadStorePointerOperand(I);
3971fe6060f1SDimitry Andric     auto *Ty = getLoadStoreType(I);
397204eeddc0SDimitry Andric     Type *VTy = Ty;
397304eeddc0SDimitry Andric     if (VF.isVector())
397404eeddc0SDimitry Andric       VTy = VectorType::get(Ty, VF);
39755ffd83dbSDimitry Andric     const Align Alignment = getLoadStoreAlignment(I);
3976480093f4SDimitry Andric     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
397704eeddc0SDimitry Andric                                 TTI.isLegalMaskedGather(VTy, Alignment))
3978480093f4SDimitry Andric                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
397904eeddc0SDimitry Andric                                 TTI.isLegalMaskedScatter(VTy, Alignment));
39800b57cec5SDimitry Andric   }
39810b57cec5SDimitry Andric   case Instruction::UDiv:
39820b57cec5SDimitry Andric   case Instruction::SDiv:
39830b57cec5SDimitry Andric   case Instruction::SRem:
3984bdd1243dSDimitry Andric   case Instruction::URem: {
3985bdd1243dSDimitry Andric     // We have the option to use the safe-divisor idiom to avoid predication.
3986bdd1243dSDimitry Andric     // The cost based decision here will always select safe-divisor for
3987bdd1243dSDimitry Andric     // scalable vectors as scalarization isn't legal.
3988bdd1243dSDimitry Andric     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3989bdd1243dSDimitry Andric     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3990bdd1243dSDimitry Andric   }
3991bdd1243dSDimitry Andric   }
3992bdd1243dSDimitry Andric }
3993bdd1243dSDimitry Andric 
3994bdd1243dSDimitry Andric bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3995bdd1243dSDimitry Andric   if (!blockNeedsPredicationForAnyReason(I->getParent()))
3996bdd1243dSDimitry Andric     return false;
3997bdd1243dSDimitry Andric 
3998bdd1243dSDimitry Andric   // Can we prove this instruction is safe to unconditionally execute?
3999bdd1243dSDimitry Andric   // If not, we must use some form of predication.
4000bdd1243dSDimitry Andric   switch (I->getOpcode()) {
4001bdd1243dSDimitry Andric   default:
4002bdd1243dSDimitry Andric     return false;
4003bdd1243dSDimitry Andric   case Instruction::Load:
4004bdd1243dSDimitry Andric   case Instruction::Store: {
4005bdd1243dSDimitry Andric     if (!Legal->isMaskRequired(I))
4006bdd1243dSDimitry Andric       return false;
4007bdd1243dSDimitry Andric     // When we know the load's address is loop invariant and the instruction
4008bdd1243dSDimitry Andric     // in the original scalar loop was unconditionally executed then we
4009bdd1243dSDimitry Andric     // don't need to mark it as a predicated instruction. Tail folding may
4010bdd1243dSDimitry Andric     // introduce additional predication, but we're guaranteed to always have
4011bdd1243dSDimitry Andric     // at least one active lane.  We call Legal->blockNeedsPredication here
4012bdd1243dSDimitry Andric     // because it doesn't query tail-folding.  For stores, we need to prove
4013bdd1243dSDimitry Andric     // both speculation safety (which follows from the same argument as for
4014bdd1243dSDimitry Andric     // loads) and that the value being stored is correct.  The easiest form
4015bdd1243dSDimitry Andric     // of the latter is to require that all values stored are the same.
4016fe013be4SDimitry Andric     if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
4017bdd1243dSDimitry Andric         (isa<LoadInst>(I) ||
4018bdd1243dSDimitry Andric          (isa<StoreInst>(I) &&
4019bdd1243dSDimitry Andric           TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4020bdd1243dSDimitry Andric         !Legal->blockNeedsPredication(I->getParent()))
4021bdd1243dSDimitry Andric       return false;
4022bdd1243dSDimitry Andric     return true;
4023bdd1243dSDimitry Andric   }
4024bdd1243dSDimitry Andric   case Instruction::UDiv:
4025bdd1243dSDimitry Andric   case Instruction::SDiv:
4026bdd1243dSDimitry Andric   case Instruction::SRem:
40270b57cec5SDimitry Andric   case Instruction::URem:
4028fcaf7f86SDimitry Andric     // TODO: We can use the loop-preheader as context point here and get
4029fcaf7f86SDimitry Andric     // context sensitive reasoning
4030fcaf7f86SDimitry Andric     return !isSafeToSpeculativelyExecute(I);
4031fe013be4SDimitry Andric   case Instruction::Call:
4032fe013be4SDimitry Andric     return Legal->isMaskRequired(I);
40330b57cec5SDimitry Andric   }
4034bdd1243dSDimitry Andric }
4035bdd1243dSDimitry Andric 
4036bdd1243dSDimitry Andric std::pair<InstructionCost, InstructionCost>
4037bdd1243dSDimitry Andric LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4038bdd1243dSDimitry Andric                                                     ElementCount VF) const {
4039bdd1243dSDimitry Andric   assert(I->getOpcode() == Instruction::UDiv ||
4040bdd1243dSDimitry Andric          I->getOpcode() == Instruction::SDiv ||
4041bdd1243dSDimitry Andric          I->getOpcode() == Instruction::SRem ||
4042bdd1243dSDimitry Andric          I->getOpcode() == Instruction::URem);
4043bdd1243dSDimitry Andric   assert(!isSafeToSpeculativelyExecute(I));
4044bdd1243dSDimitry Andric 
4045bdd1243dSDimitry Andric   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4046bdd1243dSDimitry Andric 
4047bdd1243dSDimitry Andric   // Scalarization isn't legal for scalable vector types
4048bdd1243dSDimitry Andric   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4049bdd1243dSDimitry Andric   if (!VF.isScalable()) {
4050bdd1243dSDimitry Andric     // Get the scalarization cost and scale this amount by the probability of
4051bdd1243dSDimitry Andric     // executing the predicated block. If the instruction is not predicated,
4052bdd1243dSDimitry Andric     // we fall through to the next case.
4053bdd1243dSDimitry Andric     ScalarizationCost = 0;
4054bdd1243dSDimitry Andric 
4055bdd1243dSDimitry Andric     // These instructions have a non-void type, so account for the phi nodes
4056bdd1243dSDimitry Andric     // that we will create. This cost is likely to be zero. The phi node
4057bdd1243dSDimitry Andric     // cost, if any, should be scaled by the block probability because it
4058bdd1243dSDimitry Andric     // models a copy at the end of each predicated block.
4059bdd1243dSDimitry Andric     ScalarizationCost += VF.getKnownMinValue() *
4060bdd1243dSDimitry Andric       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4061bdd1243dSDimitry Andric 
4062bdd1243dSDimitry Andric     // The cost of the non-predicated instruction.
4063bdd1243dSDimitry Andric     ScalarizationCost += VF.getKnownMinValue() *
4064bdd1243dSDimitry Andric       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4065bdd1243dSDimitry Andric 
4066bdd1243dSDimitry Andric     // The cost of insertelement and extractelement instructions needed for
4067bdd1243dSDimitry Andric     // scalarization.
4068bdd1243dSDimitry Andric     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4069bdd1243dSDimitry Andric 
4070bdd1243dSDimitry Andric     // Scale the cost by the probability of executing the predicated blocks.
4071bdd1243dSDimitry Andric     // This assumes the predicated block for each vector lane is equally
4072bdd1243dSDimitry Andric     // likely.
4073bdd1243dSDimitry Andric     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4074bdd1243dSDimitry Andric   }
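  // Rough worked example for the estimate above (made-up numbers): with
  // VF=4, a phi cost of 0, a scalar divide cost of 20, insert/extract
  // overhead of 8, and getReciprocalPredBlockProb() == 2, this yields
  // (4*0 + 4*20 + 8) / 2 = 44.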
4075bdd1243dSDimitry Andric   InstructionCost SafeDivisorCost = 0;
4076bdd1243dSDimitry Andric 
4077bdd1243dSDimitry Andric   auto *VecTy = ToVectorTy(I->getType(), VF);
4078bdd1243dSDimitry Andric 
4079bdd1243dSDimitry Andric   // The cost of the select guard to ensure all lanes are well defined
4080bdd1243dSDimitry Andric   // after we speculate above any internal control flow.
4081bdd1243dSDimitry Andric   SafeDivisorCost += TTI.getCmpSelInstrCost(
4082bdd1243dSDimitry Andric     Instruction::Select, VecTy,
4083bdd1243dSDimitry Andric     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4084bdd1243dSDimitry Andric     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4085bdd1243dSDimitry Andric 
4086bdd1243dSDimitry Andric   // Certain instructions can be cheaper to vectorize if they have a constant
4087bdd1243dSDimitry Andric   // second vector operand. One example of this are shifts on x86.
4088bdd1243dSDimitry Andric   Value *Op2 = I->getOperand(1);
4089bdd1243dSDimitry Andric   auto Op2Info = TTI.getOperandInfo(Op2);
4090fe013be4SDimitry Andric   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4091fe013be4SDimitry Andric       Legal->isInvariant(Op2))
4092bdd1243dSDimitry Andric     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4093bdd1243dSDimitry Andric 
4094bdd1243dSDimitry Andric   SmallVector<const Value *, 4> Operands(I->operand_values());
4095bdd1243dSDimitry Andric   SafeDivisorCost += TTI.getArithmeticInstrCost(
4096bdd1243dSDimitry Andric     I->getOpcode(), VecTy, CostKind,
4097bdd1243dSDimitry Andric     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4098bdd1243dSDimitry Andric     Op2Info, Operands, I);
4099bdd1243dSDimitry Andric   return {ScalarizationCost, SafeDivisorCost};
41000b57cec5SDimitry Andric }
41010b57cec5SDimitry Andric 
4102e8d8bef9SDimitry Andric bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4103e8d8bef9SDimitry Andric     Instruction *I, ElementCount VF) {
41040b57cec5SDimitry Andric   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
41050b57cec5SDimitry Andric   assert(getWideningDecision(I, VF) == CM_Unknown &&
41060b57cec5SDimitry Andric          "Decision should not be set yet.");
41070b57cec5SDimitry Andric   auto *Group = getInterleavedAccessGroup(I);
41080b57cec5SDimitry Andric   assert(Group && "Must have a group.");
41090b57cec5SDimitry Andric 
41100b57cec5SDimitry Andric   // If the instruction's allocated size doesn't equal its type size, it
41110b57cec5SDimitry Andric   // requires padding and will be scalarized.
41120b57cec5SDimitry Andric   auto &DL = I->getModule()->getDataLayout();
4113fe6060f1SDimitry Andric   auto *ScalarTy = getLoadStoreType(I);
4114d409305fSDimitry Andric   if (hasIrregularType(ScalarTy, DL))
41150b57cec5SDimitry Andric     return false;
41160b57cec5SDimitry Andric 
411781ad6265SDimitry Andric   // If the group involves a non-integral pointer, we may not be able to
411881ad6265SDimitry Andric   // losslessly cast all values to a common type.
411981ad6265SDimitry Andric   unsigned InterleaveFactor = Group->getFactor();
412081ad6265SDimitry Andric   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
412181ad6265SDimitry Andric   for (unsigned i = 0; i < InterleaveFactor; i++) {
412281ad6265SDimitry Andric     Instruction *Member = Group->getMember(i);
412381ad6265SDimitry Andric     if (!Member)
412481ad6265SDimitry Andric       continue;
412581ad6265SDimitry Andric     auto *MemberTy = getLoadStoreType(Member);
412681ad6265SDimitry Andric     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
412781ad6265SDimitry Andric     // Don't coerce non-integral pointers to integers or vice versa.
412881ad6265SDimitry Andric     if (MemberNI != ScalarNI) {
412981ad6265SDimitry Andric       // TODO: Consider adding special nullptr value case here
413081ad6265SDimitry Andric       return false;
413181ad6265SDimitry Andric     } else if (MemberNI && ScalarNI &&
413281ad6265SDimitry Andric                ScalarTy->getPointerAddressSpace() !=
413381ad6265SDimitry Andric                MemberTy->getPointerAddressSpace()) {
413481ad6265SDimitry Andric       return false;
413581ad6265SDimitry Andric     }
413681ad6265SDimitry Andric   }
413781ad6265SDimitry Andric 
41380b57cec5SDimitry Andric   // Check if masking is required.
41390b57cec5SDimitry Andric   // A Group may need masking for one of two reasons: it resides in a block that
4140349cc55cSDimitry Andric   // needs predication, or it was decided to use masking to deal with gaps
4141349cc55cSDimitry Andric   // (either a gap at the end of a load-access that may result in a speculative
4142349cc55cSDimitry Andric   // load, or any gaps in a store-access).
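  // For example (illustrative): a factor-2 store group with only member 0
  // present leaves a gap at member 1, so the wide store must be masked to
  // avoid clobbering the gap; a load group with a trailing gap may
  // speculatively read elements no scalar iteration accesses unless it is
  // masked or backed by a scalar epilogue.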
41430b57cec5SDimitry Andric   bool PredicatedAccessRequiresMasking =
4144349cc55cSDimitry Andric       blockNeedsPredicationForAnyReason(I->getParent()) &&
4145349cc55cSDimitry Andric       Legal->isMaskRequired(I);
4146349cc55cSDimitry Andric   bool LoadAccessWithGapsRequiresEpilogMasking =
4147349cc55cSDimitry Andric       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4148349cc55cSDimitry Andric       !isScalarEpilogueAllowed();
4149349cc55cSDimitry Andric   bool StoreAccessWithGapsRequiresMasking =
4150349cc55cSDimitry Andric       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4151349cc55cSDimitry Andric   if (!PredicatedAccessRequiresMasking &&
4152349cc55cSDimitry Andric       !LoadAccessWithGapsRequiresEpilogMasking &&
4153349cc55cSDimitry Andric       !StoreAccessWithGapsRequiresMasking)
41540b57cec5SDimitry Andric     return true;
41550b57cec5SDimitry Andric 
41560b57cec5SDimitry Andric   // If masked interleaving is required, we expect that the user/target had
41570b57cec5SDimitry Andric   // enabled it, because otherwise it either wouldn't have been created or
41580b57cec5SDimitry Andric   // it should have been invalidated by the CostModel.
41590b57cec5SDimitry Andric   assert(useMaskedInterleavedAccesses(TTI) &&
41600b57cec5SDimitry Andric          "Masked interleave-groups for predicated accesses are not enabled.");
41610b57cec5SDimitry Andric 
4162349cc55cSDimitry Andric   if (Group->isReverse())
4163349cc55cSDimitry Andric     return false;
4164349cc55cSDimitry Andric 
4165fe6060f1SDimitry Andric   auto *Ty = getLoadStoreType(I);
41665ffd83dbSDimitry Andric   const Align Alignment = getLoadStoreAlignment(I);
41678bcb0991SDimitry Andric   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
41688bcb0991SDimitry Andric                           : TTI.isLegalMaskedStore(Ty, Alignment);
41690b57cec5SDimitry Andric }
41700b57cec5SDimitry Andric 
4171e8d8bef9SDimitry Andric bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4172e8d8bef9SDimitry Andric     Instruction *I, ElementCount VF) {
41730b57cec5SDimitry Andric   // Get and ensure we have a valid memory instruction.
4174349cc55cSDimitry Andric   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
41750b57cec5SDimitry Andric 
41760b57cec5SDimitry Andric   auto *Ptr = getLoadStorePointerOperand(I);
4177349cc55cSDimitry Andric   auto *ScalarTy = getLoadStoreType(I);
41780b57cec5SDimitry Andric 
41790b57cec5SDimitry Andric   // In order to be widened, the pointer should be consecutive, first of all.
4180349cc55cSDimitry Andric   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
41810b57cec5SDimitry Andric     return false;
41820b57cec5SDimitry Andric 
41830b57cec5SDimitry Andric   // If the instruction is a store located in a predicated block, it will be
41840b57cec5SDimitry Andric   // scalarized.
418504eeddc0SDimitry Andric   if (isScalarWithPredication(I, VF))
41860b57cec5SDimitry Andric     return false;
41870b57cec5SDimitry Andric 
41880b57cec5SDimitry Andric   // If the instruction's allocated size doesn't equal its type size, it
41890b57cec5SDimitry Andric   // requires padding and will be scalarized.
41900b57cec5SDimitry Andric   auto &DL = I->getModule()->getDataLayout();
4191d409305fSDimitry Andric   if (hasIrregularType(ScalarTy, DL))
41920b57cec5SDimitry Andric     return false;
41930b57cec5SDimitry Andric 
41940b57cec5SDimitry Andric   return true;
41950b57cec5SDimitry Andric }
41960b57cec5SDimitry Andric 
4197e8d8bef9SDimitry Andric void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
41980b57cec5SDimitry Andric   // We should not collect Uniforms more than once per VF. Right now,
41990b57cec5SDimitry Andric   // this function is called from collectUniformsAndScalars(), which
42000b57cec5SDimitry Andric   // already does this check. Collecting Uniforms for VF=1 does not make any
42010b57cec5SDimitry Andric   // sense.
42020b57cec5SDimitry Andric 
4203fe013be4SDimitry Andric   assert(VF.isVector() && !Uniforms.contains(VF) &&
42040b57cec5SDimitry Andric          "This function should not be visited twice for the same VF");
42050b57cec5SDimitry Andric 
42060b57cec5SDimitry Andric   // Initialize the entry for this VF now; even if we find no uniform value,
42070b57cec5SDimitry Andric   // we will not analyze it again, since Uniforms.count(VF) will return 1.
42080b57cec5SDimitry Andric   Uniforms[VF].clear();
42090b57cec5SDimitry Andric 
42100b57cec5SDimitry Andric   // We now know that the loop is vectorizable!
42110b57cec5SDimitry Andric   // Collect instructions inside the loop that will remain uniform after
42120b57cec5SDimitry Andric   // vectorization.
42130b57cec5SDimitry Andric 
42140b57cec5SDimitry Andric   // Global values, params and instructions outside of current loop are out of
42150b57cec5SDimitry Andric   // scope.
42160b57cec5SDimitry Andric   auto isOutOfScope = [&](Value *V) -> bool {
42170b57cec5SDimitry Andric     Instruction *I = dyn_cast<Instruction>(V);
42180b57cec5SDimitry Andric     return (!I || !TheLoop->contains(I));
42190b57cec5SDimitry Andric   };
42200b57cec5SDimitry Andric 
4221349cc55cSDimitry Andric   // Worklist containing uniform instructions demanding lane 0.
42220b57cec5SDimitry Andric   SetVector<Instruction *> Worklist;
42230b57cec5SDimitry Andric   BasicBlock *Latch = TheLoop->getLoopLatch();
42240b57cec5SDimitry Andric 
4225349cc55cSDimitry Andric   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4226349cc55cSDimitry Andric   // that are scalar with predication must not be considered uniform after
4227349cc55cSDimitry Andric   // vectorization, because that would create an erroneous replicating region
4228349cc55cSDimitry Andric   // where only a single instance out of VF should be formed.
4229480093f4SDimitry Andric   // TODO: optimize such rare cases if they turn out to be important, see PR40816.
4230480093f4SDimitry Andric   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4231e8d8bef9SDimitry Andric     if (isOutOfScope(I)) {
4232e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4233e8d8bef9SDimitry Andric                         << *I << "\n");
4234e8d8bef9SDimitry Andric       return;
4235e8d8bef9SDimitry Andric     }
423604eeddc0SDimitry Andric     if (isScalarWithPredication(I, VF)) {
4237480093f4SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4238480093f4SDimitry Andric                         << *I << "\n");
4239480093f4SDimitry Andric       return;
4240480093f4SDimitry Andric     }
4241480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4242480093f4SDimitry Andric     Worklist.insert(I);
4243480093f4SDimitry Andric   };
4244480093f4SDimitry Andric 
42450b57cec5SDimitry Andric   // Start with the conditional branch. If the branch condition is an
42460b57cec5SDimitry Andric   // instruction contained in the loop that is only used by the branch, it is
42470b57cec5SDimitry Andric   // uniform.
42480b57cec5SDimitry Andric   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4249480093f4SDimitry Andric   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4250480093f4SDimitry Andric     addToWorklistIfAllowed(Cmp);
42510b57cec5SDimitry Andric 
4252fe013be4SDimitry Andric   auto PrevVF = VF.divideCoefficientBy(2);
4253bdd1243dSDimitry Andric   // Return true if all lanes perform the same memory operation, and we can
4254bdd1243dSDimitry Andric   // thus choose to execute only one.
4255bdd1243dSDimitry Andric   auto isUniformMemOpUse = [&](Instruction *I) {
4256fe013be4SDimitry Andric     // If the value was already known to not be uniform for the previous
4257fe013be4SDimitry Andric     // (smaller VF), it cannot be uniform for the larger VF.
4258fe013be4SDimitry Andric     if (PrevVF.isVector()) {
4259fe013be4SDimitry Andric       auto Iter = Uniforms.find(PrevVF);
4260fe013be4SDimitry Andric       if (Iter != Uniforms.end() && !Iter->second.contains(I))
4261fe013be4SDimitry Andric         return false;
4262fe013be4SDimitry Andric     }
4263fe013be4SDimitry Andric     if (!Legal->isUniformMemOp(*I, VF))
4264bdd1243dSDimitry Andric       return false;
4265bdd1243dSDimitry Andric     if (isa<LoadInst>(I))
4266bdd1243dSDimitry Andric       // Loading the same address always produces the same result - at least
4267bdd1243dSDimitry Andric       // assuming aliasing and ordering which have already been checked.
4268bdd1243dSDimitry Andric       return true;
4269bdd1243dSDimitry Andric     // Storing the same value on every iteration.
4270bdd1243dSDimitry Andric     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4271bdd1243dSDimitry Andric   };
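  // Illustrative (hypothetical) IR for both cases, with %p loop-invariant
  // and %inv defined outside the loop:
  //   %v = load i32, ptr %p    ; every iteration loads the same address
  //   store i32 %inv, ptr %p   ; every iteration stores the same value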
4272bdd1243dSDimitry Andric 
4273e8d8bef9SDimitry Andric   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
42740b57cec5SDimitry Andric     InstWidening WideningDecision = getWideningDecision(I, VF);
42750b57cec5SDimitry Andric     assert(WideningDecision != CM_Unknown &&
42760b57cec5SDimitry Andric            "Widening decision should be ready at this moment");
42770b57cec5SDimitry Andric 
4278bdd1243dSDimitry Andric     if (isUniformMemOpUse(I))
4279e8d8bef9SDimitry Andric       return true;
4280e8d8bef9SDimitry Andric 
42810b57cec5SDimitry Andric     return (WideningDecision == CM_Widen ||
42820b57cec5SDimitry Andric             WideningDecision == CM_Widen_Reverse ||
42830b57cec5SDimitry Andric             WideningDecision == CM_Interleave);
42840b57cec5SDimitry Andric   };
4285e8d8bef9SDimitry Andric 
4286e8d8bef9SDimitry Andric   // Returns true if Ptr is the pointer operand of a memory access instruction
4287dbbaf778SDimitry Andric   // I, I is known to not require scalarization, and the pointer is not also
4288dbbaf778SDimitry Andric   // stored.
4289e8d8bef9SDimitry Andric   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4290fe013be4SDimitry Andric     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4291fe013be4SDimitry Andric       return false;
4292fe013be4SDimitry Andric     return getLoadStorePointerOperand(I) == Ptr &&
4293fe013be4SDimitry Andric            (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4294e8d8bef9SDimitry Andric   };
4295e8d8bef9SDimitry Andric 
4296e8d8bef9SDimitry Andric   // Holds a list of values which are known to have at least one uniform use.
4297e8d8bef9SDimitry Andric   // Note that there may be other uses which aren't uniform.  A "uniform use"
4298e8d8bef9SDimitry Andric   // here is something which only demands lane 0 of the unrolled iterations;
4299e8d8bef9SDimitry Andric   // it does not imply that all lanes produce the same value (i.e. this is not
4300e8d8bef9SDimitry Andric   // the usual meaning of uniform).
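  // For instance, the address of a consecutive (unit-stride) load has a
  // uniform use in this sense: once the load is widened, only the lane-0
  // address is materialized, even though the loaded values differ per lane.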
4301fe6060f1SDimitry Andric   SetVector<Value *> HasUniformUse;
4302e8d8bef9SDimitry Andric 
4303e8d8bef9SDimitry Andric   // Scan the loop for instructions which are either a) known to have only
4304e8d8bef9SDimitry Andric   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
43050b57cec5SDimitry Andric   for (auto *BB : TheLoop->blocks())
43060b57cec5SDimitry Andric     for (auto &I : *BB) {
43076e75b2fbSDimitry Andric       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
43086e75b2fbSDimitry Andric         switch (II->getIntrinsicID()) {
43096e75b2fbSDimitry Andric         case Intrinsic::sideeffect:
43106e75b2fbSDimitry Andric         case Intrinsic::experimental_noalias_scope_decl:
43116e75b2fbSDimitry Andric         case Intrinsic::assume:
43126e75b2fbSDimitry Andric         case Intrinsic::lifetime_start:
43136e75b2fbSDimitry Andric         case Intrinsic::lifetime_end:
43146e75b2fbSDimitry Andric           if (TheLoop->hasLoopInvariantOperands(&I))
43156e75b2fbSDimitry Andric             addToWorklistIfAllowed(&I);
43166e75b2fbSDimitry Andric           break;
43176e75b2fbSDimitry Andric         default:
43186e75b2fbSDimitry Andric           break;
43196e75b2fbSDimitry Andric         }
43206e75b2fbSDimitry Andric       }
43216e75b2fbSDimitry Andric 
4322349cc55cSDimitry Andric       // ExtractValue instructions must be uniform, because the operands are
4323349cc55cSDimitry Andric       // known to be loop-invariant.
4324349cc55cSDimitry Andric       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4325349cc55cSDimitry Andric         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4326349cc55cSDimitry Andric                "Expected aggregate value to be loop invariant");
4327349cc55cSDimitry Andric         addToWorklistIfAllowed(EVI);
4328349cc55cSDimitry Andric         continue;
4329349cc55cSDimitry Andric       }
4330349cc55cSDimitry Andric 
43310b57cec5SDimitry Andric       // If there's no pointer operand, there's nothing to do.
4332e8d8bef9SDimitry Andric       auto *Ptr = getLoadStorePointerOperand(&I);
43330b57cec5SDimitry Andric       if (!Ptr)
43340b57cec5SDimitry Andric         continue;
43350b57cec5SDimitry Andric 
4336bdd1243dSDimitry Andric       if (isUniformMemOpUse(&I))
4337e8d8bef9SDimitry Andric         addToWorklistIfAllowed(&I);
43380b57cec5SDimitry Andric 
4339fe013be4SDimitry Andric       if (isVectorizedMemAccessUse(&I, Ptr))
4340e8d8bef9SDimitry Andric         HasUniformUse.insert(Ptr);
4341e8d8bef9SDimitry Andric     }
43420b57cec5SDimitry Andric 
4343e8d8bef9SDimitry Andric   // Add to the worklist any operands which have *only* uniform (i.e. lane 0
4344e8d8bef9SDimitry Andric   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4345e8d8bef9SDimitry Andric   // disallows uses outside the loop as well.
4346e8d8bef9SDimitry Andric   for (auto *V : HasUniformUse) {
4347e8d8bef9SDimitry Andric     if (isOutOfScope(V))
4348e8d8bef9SDimitry Andric       continue;
4349e8d8bef9SDimitry Andric     auto *I = cast<Instruction>(V);
4350e8d8bef9SDimitry Andric     auto UsersAreMemAccesses =
4351e8d8bef9SDimitry Andric       llvm::all_of(I->users(), [&](User *U) -> bool {
4352e8d8bef9SDimitry Andric         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4353e8d8bef9SDimitry Andric       });
4354e8d8bef9SDimitry Andric     if (UsersAreMemAccesses)
4355e8d8bef9SDimitry Andric       addToWorklistIfAllowed(I);
4356e8d8bef9SDimitry Andric   }
43570b57cec5SDimitry Andric 
43580b57cec5SDimitry Andric   // Expand Worklist in topological order: whenever a new instruction
43590b57cec5SDimitry Andric   // is added, its users should already be inside Worklist.  This ensures
43600b57cec5SDimitry Andric   // a uniform instruction will only be used by uniform instructions.
43610b57cec5SDimitry Andric   unsigned idx = 0;
43620b57cec5SDimitry Andric   while (idx != Worklist.size()) {
43630b57cec5SDimitry Andric     Instruction *I = Worklist[idx++];
43640b57cec5SDimitry Andric 
4365bdd1243dSDimitry Andric     for (auto *OV : I->operand_values()) {
43660b57cec5SDimitry Andric       // isOutOfScope operands cannot be uniform instructions.
43670b57cec5SDimitry Andric       if (isOutOfScope(OV))
43680b57cec5SDimitry Andric         continue;
43690b57cec5SDimitry Andric       // First order recurrence Phi's should typically be considered
43700b57cec5SDimitry Andric       // non-uniform.
43710b57cec5SDimitry Andric       auto *OP = dyn_cast<PHINode>(OV);
4372bdd1243dSDimitry Andric       if (OP && Legal->isFixedOrderRecurrence(OP))
43730b57cec5SDimitry Andric         continue;
43740b57cec5SDimitry Andric       // If all the users of the operand are uniform, then add the
43750b57cec5SDimitry Andric       // operand into the uniform worklist.
43760b57cec5SDimitry Andric       auto *OI = cast<Instruction>(OV);
43770b57cec5SDimitry Andric       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
43780b57cec5SDimitry Andric             auto *J = cast<Instruction>(U);
4379e8d8bef9SDimitry Andric             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4380480093f4SDimitry Andric           }))
4381480093f4SDimitry Andric         addToWorklistIfAllowed(OI);
43820b57cec5SDimitry Andric     }
43830b57cec5SDimitry Andric   }
43840b57cec5SDimitry Andric 
43850b57cec5SDimitry Andric   // For an instruction to be added into Worklist above, all its users inside
43860b57cec5SDimitry Andric   // the loop should also be in Worklist. However, this condition cannot be
43870b57cec5SDimitry Andric   // true for phi nodes that form a cyclic dependence. We must process phi
43880b57cec5SDimitry Andric   // nodes separately. An induction variable will remain uniform if all users
43890b57cec5SDimitry Andric   // of the induction variable and induction variable update remain uniform.
43900b57cec5SDimitry Andric   // The code below handles both pointer and non-pointer induction variables.
4391bdd1243dSDimitry Andric   for (const auto &Induction : Legal->getInductionVars()) {
43920b57cec5SDimitry Andric     auto *Ind = Induction.first;
43930b57cec5SDimitry Andric     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
43940b57cec5SDimitry Andric 
43950b57cec5SDimitry Andric     // Determine if all users of the induction variable are uniform after
43960b57cec5SDimitry Andric     // vectorization.
43970b57cec5SDimitry Andric     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
43980b57cec5SDimitry Andric       auto *I = cast<Instruction>(U);
43990b57cec5SDimitry Andric       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
44000b57cec5SDimitry Andric              isVectorizedMemAccessUse(I, Ind);
44010b57cec5SDimitry Andric     });
44020b57cec5SDimitry Andric     if (!UniformInd)
44030b57cec5SDimitry Andric       continue;
44040b57cec5SDimitry Andric 
44050b57cec5SDimitry Andric     // Determine if all users of the induction variable update instruction are
44060b57cec5SDimitry Andric     // uniform after vectorization.
44070b57cec5SDimitry Andric     auto UniformIndUpdate =
44080b57cec5SDimitry Andric         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
44090b57cec5SDimitry Andric           auto *I = cast<Instruction>(U);
44100b57cec5SDimitry Andric           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
44110b57cec5SDimitry Andric                  isVectorizedMemAccessUse(I, IndUpdate);
44120b57cec5SDimitry Andric         });
44130b57cec5SDimitry Andric     if (!UniformIndUpdate)
44140b57cec5SDimitry Andric       continue;
44150b57cec5SDimitry Andric 
44160b57cec5SDimitry Andric     // The induction variable and its update instruction will remain uniform.
4417480093f4SDimitry Andric     addToWorklistIfAllowed(Ind);
4418480093f4SDimitry Andric     addToWorklistIfAllowed(IndUpdate);
44190b57cec5SDimitry Andric   }
44200b57cec5SDimitry Andric 
44210b57cec5SDimitry Andric   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
44220b57cec5SDimitry Andric }
44230b57cec5SDimitry Andric 
44248bcb0991SDimitry Andric bool LoopVectorizationCostModel::runtimeChecksRequired() {
44258bcb0991SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
44260b57cec5SDimitry Andric 
44270b57cec5SDimitry Andric   if (Legal->getRuntimePointerChecking()->Need) {
44288bcb0991SDimitry Andric     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
44298bcb0991SDimitry Andric         "runtime pointer checks needed. Enable vectorization of this "
44300b57cec5SDimitry Andric         "loop with '#pragma clang loop vectorize(enable)' when "
44318bcb0991SDimitry Andric         "compiling with -Os/-Oz",
44328bcb0991SDimitry Andric         "CantVersionLoopWithOptForSize", ORE, TheLoop);
44338bcb0991SDimitry Andric     return true;
44340b57cec5SDimitry Andric   }
44350b57cec5SDimitry Andric 
443681ad6265SDimitry Andric   if (!PSE.getPredicate().isAlwaysTrue()) {
44378bcb0991SDimitry Andric     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
44388bcb0991SDimitry Andric         "runtime SCEV checks needed. Enable vectorization of this "
44390b57cec5SDimitry Andric         "loop with '#pragma clang loop vectorize(enable)' when "
44408bcb0991SDimitry Andric         "compiling with -Os/-Oz",
44418bcb0991SDimitry Andric         "CantVersionLoopWithOptForSize", ORE, TheLoop);
44428bcb0991SDimitry Andric     return true;
44430b57cec5SDimitry Andric   }
44440b57cec5SDimitry Andric 
44450b57cec5SDimitry Andric   // FIXME: Avoid specializing for stride==1 instead of bailing out.
44460b57cec5SDimitry Andric   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
44475ffd83dbSDimitry Andric     reportVectorizationFailure("Runtime stride check for small trip count",
44488bcb0991SDimitry Andric         "runtime stride == 1 checks needed. Enable vectorization of "
44495ffd83dbSDimitry Andric         "this loop without such check by compiling with -Os/-Oz",
44508bcb0991SDimitry Andric         "CantVersionLoopWithOptForSize", ORE, TheLoop);
44518bcb0991SDimitry Andric     return true;
44528bcb0991SDimitry Andric   }
44538bcb0991SDimitry Andric 
44548bcb0991SDimitry Andric   return false;
44558bcb0991SDimitry Andric }
44568bcb0991SDimitry Andric 
4457fe6060f1SDimitry Andric ElementCount
4458fe6060f1SDimitry Andric LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4459349cc55cSDimitry Andric   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4460fe6060f1SDimitry Andric     return ElementCount::getScalable(0);
4461fe6060f1SDimitry Andric 
4462fe6060f1SDimitry Andric   if (Hints->isScalableVectorizationDisabled()) {
4463fe6060f1SDimitry Andric     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4464fe6060f1SDimitry Andric                             "ScalableVectorizationDisabled", ORE, TheLoop);
4465fe6060f1SDimitry Andric     return ElementCount::getScalable(0);
4466fe6060f1SDimitry Andric   }
4467fe6060f1SDimitry Andric 
4468349cc55cSDimitry Andric   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4469349cc55cSDimitry Andric 
4470fe6060f1SDimitry Andric   auto MaxScalableVF = ElementCount::getScalable(
4471fe6060f1SDimitry Andric       std::numeric_limits<ElementCount::ScalarTy>::max());
4472fe6060f1SDimitry Andric 
4473fe6060f1SDimitry Andric   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4474fe6060f1SDimitry Andric   // FIXME: While for scalable vectors this is currently sufficient, this should
4475fe6060f1SDimitry Andric   // be replaced by a more detailed mechanism that filters out specific VFs,
4476fe6060f1SDimitry Andric   // instead of invalidating vectorization for a whole set of VFs based on the
4477fe6060f1SDimitry Andric   // MaxVF.
4478fe6060f1SDimitry Andric 
4479fe6060f1SDimitry Andric   // Disable scalable vectorization if the loop contains unsupported reductions.
4480fe6060f1SDimitry Andric   if (!canVectorizeReductions(MaxScalableVF)) {
4481fe6060f1SDimitry Andric     reportVectorizationInfo(
4482fe6060f1SDimitry Andric         "Scalable vectorization not supported for the reduction "
4483fe6060f1SDimitry Andric         "operations found in this loop.",
4484fe6060f1SDimitry Andric         "ScalableVFUnfeasible", ORE, TheLoop);
4485fe6060f1SDimitry Andric     return ElementCount::getScalable(0);
4486fe6060f1SDimitry Andric   }
4487fe6060f1SDimitry Andric 
4488fe6060f1SDimitry Andric   // Disable scalable vectorization if the loop contains any instructions
4489fe6060f1SDimitry Andric   // with element types not supported for scalable vectors.
4490fe6060f1SDimitry Andric   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4491fe6060f1SDimitry Andric         return !Ty->isVoidTy() &&
4492fe6060f1SDimitry Andric                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4493fe6060f1SDimitry Andric       })) {
4494fe6060f1SDimitry Andric     reportVectorizationInfo("Scalable vectorization is not supported "
4495fe6060f1SDimitry Andric                             "for all element types found in this loop.",
4496fe6060f1SDimitry Andric                             "ScalableVFUnfeasible", ORE, TheLoop);
4497fe6060f1SDimitry Andric     return ElementCount::getScalable(0);
4498fe6060f1SDimitry Andric   }
4499fe6060f1SDimitry Andric 
4500fe6060f1SDimitry Andric   if (Legal->isSafeForAnyVectorWidth())
4501fe6060f1SDimitry Andric     return MaxScalableVF;
4502fe6060f1SDimitry Andric 
4503fe6060f1SDimitry Andric   // Limit MaxScalableVF by the maximum safe dependence distance.
4504fe013be4SDimitry Andric   if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4505fe013be4SDimitry Andric     MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4506fe013be4SDimitry Andric   else
4507fe013be4SDimitry Andric     MaxScalableVF = ElementCount::getScalable(0);
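  // Illustrative arithmetic (made-up numbers): MaxSafeElements = 64 with a
  // maximum vscale of 16 gives ElementCount::getScalable(4), since
  // <vscale x 4 x ...> then never exceeds 64 elements at runtime.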
4508fe013be4SDimitry Andric 
4509fe6060f1SDimitry Andric   if (!MaxScalableVF)
4510fe6060f1SDimitry Andric     reportVectorizationInfo(
4511fe6060f1SDimitry Andric         "Max legal vector width too small, scalable vectorization "
4512fe6060f1SDimitry Andric         "unfeasible.",
4513fe6060f1SDimitry Andric         "ScalableVFUnfeasible", ORE, TheLoop);
4514fe6060f1SDimitry Andric 
4515fe6060f1SDimitry Andric   return MaxScalableVF;
4516fe6060f1SDimitry Andric }
4517fe6060f1SDimitry Andric 
45180eae32dcSDimitry Andric FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4519c9157d92SDimitry Andric     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4520fe6060f1SDimitry Andric   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4521fe6060f1SDimitry Andric   unsigned SmallestType, WidestType;
4522fe6060f1SDimitry Andric   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4523fe6060f1SDimitry Andric 
4524fe6060f1SDimitry Andric   // Get the maximum safe dependence distance in bits computed by LAA.
4525fe6060f1SDimitry Andric   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4526fe6060f1SDimitry Andric   // the memory accesses that is most restrictive (involved in the smallest
4527fe6060f1SDimitry Andric   // dependence distance).
4528fe6060f1SDimitry Andric   unsigned MaxSafeElements =
4529fe013be4SDimitry Andric       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
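  // Illustrative arithmetic (made-up numbers): a 512-bit max safe width and
  // a widest type of 32 bits give bit_floor(512 / 32) = 16 safe elements.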
4530fe6060f1SDimitry Andric 
4531fe6060f1SDimitry Andric   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4532fe6060f1SDimitry Andric   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4533fe6060f1SDimitry Andric 
4534fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4535fe6060f1SDimitry Andric                     << ".\n");
4536fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4537fe6060f1SDimitry Andric                     << ".\n");
4538fe6060f1SDimitry Andric 
4539fe6060f1SDimitry Andric   // First analyze the UserVF, fall back if the UserVF should be ignored.
4540fe6060f1SDimitry Andric   if (UserVF) {
4541fe6060f1SDimitry Andric     auto MaxSafeUserVF =
4542fe6060f1SDimitry Andric         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4543fe6060f1SDimitry Andric 
4544fe6060f1SDimitry Andric     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4545fe6060f1SDimitry Andric       // If `VF=vscale x N` is safe, then so is `VF=N`
4546fe6060f1SDimitry Andric       if (UserVF.isScalable())
4547fe6060f1SDimitry Andric         return FixedScalableVFPair(
4548fe6060f1SDimitry Andric             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4549fe6060f1SDimitry Andric       else
4550fe6060f1SDimitry Andric         return UserVF;
4551fe6060f1SDimitry Andric     }
4552fe6060f1SDimitry Andric 
4553fe6060f1SDimitry Andric     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4554fe6060f1SDimitry Andric 
4555fe6060f1SDimitry Andric     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4556fe6060f1SDimitry Andric     // is better to ignore the hint and let the compiler choose a suitable VF.
4557fe6060f1SDimitry Andric     if (!UserVF.isScalable()) {
4558fe6060f1SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4559fe6060f1SDimitry Andric                         << " is unsafe, clamping to max safe VF="
4560fe6060f1SDimitry Andric                         << MaxSafeFixedVF << ".\n");
4561fe6060f1SDimitry Andric       ORE->emit([&]() {
4562fe6060f1SDimitry Andric         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4563fe6060f1SDimitry Andric                                           TheLoop->getStartLoc(),
4564fe6060f1SDimitry Andric                                           TheLoop->getHeader())
4565fe6060f1SDimitry Andric                << "User-specified vectorization factor "
4566fe6060f1SDimitry Andric                << ore::NV("UserVectorizationFactor", UserVF)
4567fe6060f1SDimitry Andric                << " is unsafe, clamping to maximum safe vectorization factor "
4568fe6060f1SDimitry Andric                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4569fe6060f1SDimitry Andric       });
4570fe6060f1SDimitry Andric       return MaxSafeFixedVF;
4571fe6060f1SDimitry Andric     }
4572fe6060f1SDimitry Andric 
4573349cc55cSDimitry Andric     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4574349cc55cSDimitry Andric       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4575349cc55cSDimitry Andric                         << " is ignored because scalable vectors are not "
4576349cc55cSDimitry Andric                            "available.\n");
4577349cc55cSDimitry Andric       ORE->emit([&]() {
4578349cc55cSDimitry Andric         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4579349cc55cSDimitry Andric                                           TheLoop->getStartLoc(),
4580349cc55cSDimitry Andric                                           TheLoop->getHeader())
4581349cc55cSDimitry Andric                << "User-specified vectorization factor "
4582349cc55cSDimitry Andric                << ore::NV("UserVectorizationFactor", UserVF)
4583349cc55cSDimitry Andric                << " is ignored because the target does not support scalable "
4584349cc55cSDimitry Andric                   "vectors. The compiler will pick a more suitable value.";
4585349cc55cSDimitry Andric       });
4586349cc55cSDimitry Andric     } else {
4587fe6060f1SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4588fe6060f1SDimitry Andric                         << " is unsafe. Ignoring scalable UserVF.\n");
4589fe6060f1SDimitry Andric       ORE->emit([&]() {
4590fe6060f1SDimitry Andric         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4591fe6060f1SDimitry Andric                                           TheLoop->getStartLoc(),
4592fe6060f1SDimitry Andric                                           TheLoop->getHeader())
4593fe6060f1SDimitry Andric                << "User-specified vectorization factor "
4594fe6060f1SDimitry Andric                << ore::NV("UserVectorizationFactor", UserVF)
4595fe6060f1SDimitry Andric                << " is unsafe. Ignoring the hint to let the compiler pick a "
4596349cc55cSDimitry Andric                   "more suitable value.";
4597fe6060f1SDimitry Andric       });
4598fe6060f1SDimitry Andric     }
4599349cc55cSDimitry Andric   }
4600fe6060f1SDimitry Andric 
4601fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4602fe6060f1SDimitry Andric                     << " / " << WidestType << " bits.\n");
4603fe6060f1SDimitry Andric 
4604fe6060f1SDimitry Andric   FixedScalableVFPair Result(ElementCount::getFixed(1),
4605fe6060f1SDimitry Andric                              ElementCount::getScalable(0));
46060eae32dcSDimitry Andric   if (auto MaxVF =
4607c9157d92SDimitry Andric           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
46080eae32dcSDimitry Andric                                   MaxSafeFixedVF, FoldTailByMasking))
4609fe6060f1SDimitry Andric     Result.FixedVF = MaxVF;
4610fe6060f1SDimitry Andric 
46110eae32dcSDimitry Andric   if (auto MaxVF =
4612c9157d92SDimitry Andric           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
46130eae32dcSDimitry Andric                                   MaxSafeScalableVF, FoldTailByMasking))
4614fe6060f1SDimitry Andric     if (MaxVF.isScalable()) {
4615fe6060f1SDimitry Andric       Result.ScalableVF = MaxVF;
4616fe6060f1SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4617fe6060f1SDimitry Andric                         << "\n");
4618fe6060f1SDimitry Andric     }
4619fe6060f1SDimitry Andric 
4620fe6060f1SDimitry Andric   return Result;
4621fe6060f1SDimitry Andric }
4622fe6060f1SDimitry Andric 
4623fe6060f1SDimitry Andric FixedScalableVFPair
4624e8d8bef9SDimitry Andric LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
46258bcb0991SDimitry Andric   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
46268bcb0991SDimitry Andric     // TODO: It may be useful to do this, since the branch is still likely
46278bcb0991SDimitry Andric     // to be dynamically uniform if the target can skip it.
46288bcb0991SDimitry Andric     reportVectorizationFailure(
46298bcb0991SDimitry Andric         "Not inserting runtime ptr check for divergent target",
46308bcb0991SDimitry Andric         "runtime pointer checks needed. Not enabled for divergent target",
46318bcb0991SDimitry Andric         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4632fe6060f1SDimitry Andric     return FixedScalableVFPair::getNone();
46330b57cec5SDimitry Andric   }
46340b57cec5SDimitry Andric 
46358bcb0991SDimitry Andric   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4636c9157d92SDimitry Andric   unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
46370b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
46380b57cec5SDimitry Andric   if (TC == 1) {
46398bcb0991SDimitry Andric     reportVectorizationFailure("Single iteration (non) loop",
46408bcb0991SDimitry Andric         "loop trip count is one, irrelevant for vectorization",
46418bcb0991SDimitry Andric         "SingleIterationLoop", ORE, TheLoop);
4642fe6060f1SDimitry Andric     return FixedScalableVFPair::getNone();
46430b57cec5SDimitry Andric   }
46440b57cec5SDimitry Andric 
46458bcb0991SDimitry Andric   switch (ScalarEpilogueStatus) {
46468bcb0991SDimitry Andric   case CM_ScalarEpilogueAllowed:
4647c9157d92SDimitry Andric     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4648e8d8bef9SDimitry Andric   case CM_ScalarEpilogueNotAllowedUsePredicate:
4649bdd1243dSDimitry Andric     [[fallthrough]];
46508bcb0991SDimitry Andric   case CM_ScalarEpilogueNotNeededUsePredicate:
46518bcb0991SDimitry Andric     LLVM_DEBUG(
46528bcb0991SDimitry Andric         dbgs() << "LV: vector predicate hint/switch found.\n"
46538bcb0991SDimitry Andric                << "LV: Not allowing scalar epilogue, creating predicated "
46548bcb0991SDimitry Andric                << "vector loop.\n");
46558bcb0991SDimitry Andric     break;
46568bcb0991SDimitry Andric   case CM_ScalarEpilogueNotAllowedLowTripLoop:
46578bcb0991SDimitry Andric     // fallthrough as a special case of OptForSize
46588bcb0991SDimitry Andric   case CM_ScalarEpilogueNotAllowedOptSize:
46598bcb0991SDimitry Andric     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
46608bcb0991SDimitry Andric       LLVM_DEBUG(
46618bcb0991SDimitry Andric           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
46628bcb0991SDimitry Andric     else
46638bcb0991SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
46648bcb0991SDimitry Andric                         << "count.\n");
46650b57cec5SDimitry Andric 
46668bcb0991SDimitry Andric     // Bail if runtime checks are required, which are not good when optimising
46678bcb0991SDimitry Andric     // for size.
46688bcb0991SDimitry Andric     if (runtimeChecksRequired())
4669fe6060f1SDimitry Andric       return FixedScalableVFPair::getNone();
4670e8d8bef9SDimitry Andric 
46718bcb0991SDimitry Andric     break;
46728bcb0991SDimitry Andric   }
46730b57cec5SDimitry Andric 
4674e8d8bef9SDimitry Andric   // The only loops we can vectorize without a scalar epilogue, are loops with
4675e8d8bef9SDimitry Andric   // a bottom-test and a single exiting block. We'd have to handle the fact
4676e8d8bef9SDimitry Andric   // that not every instruction executes on the last iteration.  This will
4677e8d8bef9SDimitry Andric   // require a lane mask which varies through the vector loop body.  (TODO)
4678e8d8bef9SDimitry Andric   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4679e8d8bef9SDimitry Andric     // If there was a tail-folding hint/switch, but we can't fold the tail by
4680e8d8bef9SDimitry Andric     // masking, fallback to a vectorization with a scalar epilogue.
4681e8d8bef9SDimitry Andric     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4682e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4683e8d8bef9SDimitry Andric                            "scalar epilogue instead.\n");
4684e8d8bef9SDimitry Andric       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4685c9157d92SDimitry Andric       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4686e8d8bef9SDimitry Andric     }
4687fe6060f1SDimitry Andric     return FixedScalableVFPair::getNone();
4688e8d8bef9SDimitry Andric   }
4689e8d8bef9SDimitry Andric 
46908bcb0991SDimitry Andric   // Now try the tail folding
46918bcb0991SDimitry Andric 
46920b57cec5SDimitry Andric   // Invalidate interleave groups that require an epilogue if we can't mask
46930b57cec5SDimitry Andric   // the interleave-group.
46945ffd83dbSDimitry Andric   if (!useMaskedInterleavedAccesses(TTI)) {
46955ffd83dbSDimitry Andric     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
46965ffd83dbSDimitry Andric            "No decisions should have been taken at this point");
46975ffd83dbSDimitry Andric     // Note: There is no need to invalidate any cost modeling decisions here,
46985ffd83dbSDimitry Andric     // as none were taken so far.
46990b57cec5SDimitry Andric     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
47005ffd83dbSDimitry Andric   }
47010b57cec5SDimitry Andric 
4702c9157d92SDimitry Andric   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4703fe013be4SDimitry Andric 
4704fe6060f1SDimitry Andric   // Avoid tail folding if the trip count is known to be a multiple of any VF
4705fe013be4SDimitry Andric   // we choose.
4706fe013be4SDimitry Andric   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4707fe013be4SDimitry Andric       MaxFactors.FixedVF.getFixedValue();
4708fe013be4SDimitry Andric   if (MaxFactors.ScalableVF) {
4709fe013be4SDimitry Andric     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4710fe013be4SDimitry Andric     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4711fe013be4SDimitry Andric       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4712fe013be4SDimitry Andric           *MaxPowerOf2RuntimeVF,
4713fe013be4SDimitry Andric           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4714fe013be4SDimitry Andric     } else
4715fe013be4SDimitry Andric       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4716fe013be4SDimitry Andric   }
4717fe013be4SDimitry Andric 
4718fe013be4SDimitry Andric   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4719fe013be4SDimitry Andric     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4720fe6060f1SDimitry Andric            "MaxFixedVF must be a power of 2");
4721fe013be4SDimitry Andric     unsigned MaxVFtimesIC =
4722fe013be4SDimitry Andric         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4723e8d8bef9SDimitry Andric     ScalarEvolution *SE = PSE.getSE();
4724e8d8bef9SDimitry Andric     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4725e8d8bef9SDimitry Andric     const SCEV *ExitCount = SE->getAddExpr(
4726e8d8bef9SDimitry Andric         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4727e8d8bef9SDimitry Andric     const SCEV *Rem = SE->getURemExpr(
4728fe6060f1SDimitry Andric         SE->applyLoopGuards(ExitCount, TheLoop),
4729fe6060f1SDimitry Andric         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4730e8d8bef9SDimitry Andric     if (Rem->isZero()) {
4731fe6060f1SDimitry Andric       // Accept MaxFactors if we do not have a tail.
47320b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4733fe6060f1SDimitry Andric       return MaxFactors;
47340b57cec5SDimitry Andric     }
4735fe6060f1SDimitry Andric   }
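  // Worked example (hypothetical values, for illustration): with a trip
  // count of 1024, MaxPowerOf2RuntimeVF = 8 and UserIC = 4, MaxVFtimesIC is
  // 32 and 1024 % 32 == 0, so no tail remains and MaxFactors is accepted.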
4736fe6060f1SDimitry Andric 
47370b57cec5SDimitry Andric   // If we don't know the precise trip count, or if the trip count that we
47380b57cec5SDimitry Andric   // found modulo the vectorization factor is not zero, try to fold the tail
47390b57cec5SDimitry Andric   // by masking.
47400b57cec5SDimitry Andric   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
47418bcb0991SDimitry Andric   if (Legal->prepareToFoldTailByMasking()) {
4742fe013be4SDimitry Andric     CanFoldTailByMasking = true;
4743fe6060f1SDimitry Andric     return MaxFactors;
47440b57cec5SDimitry Andric   }
47450b57cec5SDimitry Andric 
4746e8d8bef9SDimitry Andric   // If there was a tail-folding hint/switch, but we can't fold the tail by
4747e8d8bef9SDimitry Andric   // masking, fall back to vectorization with a scalar epilogue.
4748e8d8bef9SDimitry Andric   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4749e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4750e8d8bef9SDimitry Andric                          "scalar epilogue instead.\n");
4751e8d8bef9SDimitry Andric     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4752fe6060f1SDimitry Andric     return MaxFactors;
4753e8d8bef9SDimitry Andric   }
4754e8d8bef9SDimitry Andric 
4755e8d8bef9SDimitry Andric   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4756e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4757fe6060f1SDimitry Andric     return FixedScalableVFPair::getNone();
4758e8d8bef9SDimitry Andric   }
4759e8d8bef9SDimitry Andric 
47600b57cec5SDimitry Andric   if (TC == 0) {
47618bcb0991SDimitry Andric     reportVectorizationFailure(
47628bcb0991SDimitry Andric         "Unable to calculate the loop count due to complex control flow",
47638bcb0991SDimitry Andric         "unable to calculate the loop count due to complex control flow",
47648bcb0991SDimitry Andric         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4765fe6060f1SDimitry Andric     return FixedScalableVFPair::getNone();
47660b57cec5SDimitry Andric   }
47670b57cec5SDimitry Andric 
47688bcb0991SDimitry Andric   reportVectorizationFailure(
47698bcb0991SDimitry Andric       "Cannot optimize for size and vectorize at the same time.",
47708bcb0991SDimitry Andric       "cannot optimize for size and vectorize at the same time. "
47710b57cec5SDimitry Andric       "Enable vectorization of this loop with '#pragma clang loop "
47728bcb0991SDimitry Andric       "vectorize(enable)' when compiling with -Os/-Oz",
47738bcb0991SDimitry Andric       "NoTailLoopWithOptForSize", ORE, TheLoop);
4774fe6060f1SDimitry Andric   return FixedScalableVFPair::getNone();
47750b57cec5SDimitry Andric }
47760b57cec5SDimitry Andric 
4777fe6060f1SDimitry Andric ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4778c9157d92SDimitry Andric     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
477981ad6265SDimitry Andric     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4780fe6060f1SDimitry Andric   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4781bdd1243dSDimitry Andric   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4782fe6060f1SDimitry Andric       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4783fe6060f1SDimitry Andric                            : TargetTransformInfo::RGK_FixedWidthVector);
4784e8d8bef9SDimitry Andric 
4785fe6060f1SDimitry Andric   // Convenience function to return the minimum of two ElementCounts.
4786fe6060f1SDimitry Andric   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4787fe6060f1SDimitry Andric     assert((LHS.isScalable() == RHS.isScalable()) &&
4788fe6060f1SDimitry Andric            "Scalable flags must match");
4789fe6060f1SDimitry Andric     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4790fe6060f1SDimitry Andric   };
47910b57cec5SDimitry Andric 
47925ffd83dbSDimitry Andric   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
47935ffd83dbSDimitry Andric   // Note that neither WidestRegister nor WidestType is necessarily a power of 2.
4794fe6060f1SDimitry Andric   auto MaxVectorElementCount = ElementCount::get(
4795fe013be4SDimitry Andric       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4796fe6060f1SDimitry Andric       ComputeScalableMaxVF);
4797fe6060f1SDimitry Andric   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
47980b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4799fe6060f1SDimitry Andric                     << (MaxVectorElementCount * WidestType) << " bits.\n");
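  // Worked example (hypothetical values, for illustration): with 128-bit
  // vector registers and a widest loop type of i32, bit_floor(128 / 32) = 4
  // lanes, i.e. a starting MaxVectorElementCount of 4 (or vscale x 4) before
  // clamping by MaxSafeVF.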
48000b57cec5SDimitry Andric 
4801fe6060f1SDimitry Andric   if (!MaxVectorElementCount) {
4802fe6060f1SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: The target has no "
4803fe6060f1SDimitry Andric                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4804fe6060f1SDimitry Andric                       << " vector registers.\n");
4805fe6060f1SDimitry Andric     return ElementCount::getFixed(1);
48060b57cec5SDimitry Andric   }
48070b57cec5SDimitry Andric 
4808bdd1243dSDimitry Andric   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4809bdd1243dSDimitry Andric   if (MaxVectorElementCount.isScalable() &&
4810bdd1243dSDimitry Andric       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4811bdd1243dSDimitry Andric     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4812bdd1243dSDimitry Andric     auto Min = Attr.getVScaleRangeMin();
4813bdd1243dSDimitry Andric     WidestRegisterMinEC *= Min;
4814bdd1243dSDimitry Andric   }
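  // Worked example (hypothetical values, for illustration): if
  // MaxVectorElementCount is vscale x 4 and the function carries a
  // vscale_range attribute with minimum 2, then at least 2 * 4 = 8 lanes are
  // guaranteed, so WidestRegisterMinEC becomes 8.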
4815fe013be4SDimitry Andric 
4816fe013be4SDimitry Andric   // When a scalar epilogue is required, at least one iteration of the scalar
4817c9157d92SDimitry Andric   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4818fe013be4SDimitry Andric   // max VF that results in a dead vector loop.
4819c9157d92SDimitry Andric   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4820c9157d92SDimitry Andric     MaxTripCount -= 1;
4821fe013be4SDimitry Andric 
4822c9157d92SDimitry Andric   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4823c9157d92SDimitry Andric       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4824c9157d92SDimitry Andric     // If the upper bound on the loop trip count (TC) is known at compile time,
4825c9157d92SDimitry Andric     // there is no point in choosing a VF greater than TC (as done in the loop
4826c9157d92SDimitry Andric     // below). Select the maximum power of two which doesn't exceed TC. If
4827c9157d92SDimitry Andric     // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4828c9157d92SDimitry Andric     // the TC is less than or equal to the known number of lanes.
4829c9157d92SDimitry Andric     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
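    // Worked example (hypothetical values, for illustration): a constant
    // MaxTripCount of 6 gives ClampedUpperTripCount = bit_floor(6) = 4,
    // avoiding a wider VF whose vector body would never run.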
48300eae32dcSDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
48310eae32dcSDimitry Andric                          "exceeding the constant trip count: "
4832c9157d92SDimitry Andric                       << ClampedUpperTripCount << "\n");
4833c9157d92SDimitry Andric     return ElementCount::get(
4834c9157d92SDimitry Andric         ClampedUpperTripCount,
4835c9157d92SDimitry Andric         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4836fe6060f1SDimitry Andric   }
4837fe6060f1SDimitry Andric 
483881ad6265SDimitry Andric   TargetTransformInfo::RegisterKind RegKind =
483981ad6265SDimitry Andric       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
484081ad6265SDimitry Andric                            : TargetTransformInfo::RGK_FixedWidthVector;
4841fe6060f1SDimitry Andric   ElementCount MaxVF = MaxVectorElementCount;
4842c9157d92SDimitry Andric   if (MaximizeBandwidth ||
4843c9157d92SDimitry Andric       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4844c9157d92SDimitry Andric        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4845c9157d92SDimitry Andric         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4846fe6060f1SDimitry Andric     auto MaxVectorElementCountMaxBW = ElementCount::get(
4847fe013be4SDimitry Andric         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4848fe6060f1SDimitry Andric         ComputeScalableMaxVF);
4849fe6060f1SDimitry Andric     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4850fe6060f1SDimitry Andric 
48510b57cec5SDimitry Andric     // Collect all viable vectorization factors larger than the default MaxVF
4852fe6060f1SDimitry Andric     // (i.e. MaxVectorElementCount).
4853e8d8bef9SDimitry Andric     SmallVector<ElementCount, 8> VFs;
4854fe6060f1SDimitry Andric     for (ElementCount VS = MaxVectorElementCount * 2;
4855fe6060f1SDimitry Andric          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4856fe6060f1SDimitry Andric       VFs.push_back(VS);
48570b57cec5SDimitry Andric 
48580b57cec5SDimitry Andric     // For each VF calculate its register usage.
48590b57cec5SDimitry Andric     auto RUs = calculateRegisterUsage(VFs);
48600b57cec5SDimitry Andric 
48610b57cec5SDimitry Andric     // Select the largest VF which doesn't require more registers than existing
48620b57cec5SDimitry Andric     // ones.
48630b57cec5SDimitry Andric     for (int i = RUs.size() - 1; i >= 0; --i) {
48648bcb0991SDimitry Andric       bool Selected = true;
48658bcb0991SDimitry Andric       for (auto &pair : RUs[i].MaxLocalUsers) {
48668bcb0991SDimitry Andric         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
48678bcb0991SDimitry Andric         if (pair.second > TargetNumRegisters)
48688bcb0991SDimitry Andric           Selected = false;
48698bcb0991SDimitry Andric       }
48708bcb0991SDimitry Andric       if (Selected) {
4871fe6060f1SDimitry Andric         MaxVF = VFs[i];
48720b57cec5SDimitry Andric         break;
48730b57cec5SDimitry Andric       }
48740b57cec5SDimitry Andric     }
4875fe6060f1SDimitry Andric     if (ElementCount MinVF =
4876fe6060f1SDimitry Andric             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4877fe6060f1SDimitry Andric       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
48780b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
48790b57cec5SDimitry Andric                           << ") with target's minimum: " << MinVF << '\n');
48800b57cec5SDimitry Andric         MaxVF = MinVF;
48810b57cec5SDimitry Andric       }
48820b57cec5SDimitry Andric     }
488381ad6265SDimitry Andric 
488481ad6265SDimitry Andric     // Invalidate any widening decisions we might have made, in case the loop
488581ad6265SDimitry Andric     // requires prediction (decided later), but we have already made some
488681ad6265SDimitry Andric     // load/store widening decisions.
488781ad6265SDimitry Andric     invalidateCostModelingDecisions();
48880b57cec5SDimitry Andric   }
4889fe6060f1SDimitry Andric   return MaxVF;
48900b57cec5SDimitry Andric }
48910b57cec5SDimitry Andric 
4892fe013be4SDimitry Andric /// Convenience function that returns the value of vscale_range if
4893fe013be4SDimitry Andric /// vscale_range.min == vscale_range.max, and otherwise falls back to the
4894fe013be4SDimitry Andric /// value returned by the corresponding TTI method.
4895fe013be4SDimitry Andric static std::optional<unsigned>
4896fe013be4SDimitry Andric getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4897fe013be4SDimitry Andric   const Function *Fn = L->getHeader()->getParent();
4898fe013be4SDimitry Andric   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4899fe013be4SDimitry Andric     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4900d56accc7SDimitry Andric     auto Min = Attr.getVScaleRangeMin();
4901d56accc7SDimitry Andric     auto Max = Attr.getVScaleRangeMax();
4902d56accc7SDimitry Andric     if (Max && Min == Max)
4903d56accc7SDimitry Andric       return Max;
4904d56accc7SDimitry Andric   }
4905d56accc7SDimitry Andric 
4906d56accc7SDimitry Andric   return TTI.getVScaleForTuning();
4907d56accc7SDimitry Andric }
4908d56accc7SDimitry Andric 
4909fe013be4SDimitry Andric bool LoopVectorizationPlanner::isMoreProfitable(
4910fe6060f1SDimitry Andric     const VectorizationFactor &A, const VectorizationFactor &B) const {
4911fe6060f1SDimitry Andric   InstructionCost CostA = A.Cost;
4912fe6060f1SDimitry Andric   InstructionCost CostB = B.Cost;
4913e8d8bef9SDimitry Andric 
4914fe013be4SDimitry Andric   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4915fe6060f1SDimitry Andric 
4916fe013be4SDimitry Andric   if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4917fe013be4SDimitry Andric     // If the trip count is a known (possibly small) constant, the trip count
4918fe013be4SDimitry Andric     // will be rounded up to an integer number of iterations under
4919fe013be4SDimitry Andric     // FoldTailByMasking. The total cost in that case will be
4920fe013be4SDimitry Andric     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4921fe013be4SDimitry Andric     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4922fe013be4SDimitry Andric     // some extra overheads, but for the purpose of comparing the costs of
4923fe013be4SDimitry Andric     // different VFs we can use this to compare the total loop-body cost
4924fe013be4SDimitry Andric     // expected after vectorization.
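    // Worked example (hypothetical values, for illustration): with
    // MaxTripCount = 100 and VF = 8, folding the tail costs
    // VectorCost * ceil(100 / 8) = VectorCost * 13, while a scalar epilogue
    // costs VectorCost * 12 + ScalarCost * 4 for the 4 leftover iterations.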
4925fe013be4SDimitry Andric     auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4926fe013be4SDimitry Andric                                              InstructionCost VectorCost,
4927fe013be4SDimitry Andric                                              InstructionCost ScalarCost) {
4928fe013be4SDimitry Andric       return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4929fe013be4SDimitry Andric                                     : VectorCost * (MaxTripCount / VF) +
4930fe013be4SDimitry Andric                                           ScalarCost * (MaxTripCount % VF);
4931fe013be4SDimitry Andric     };
4932fe013be4SDimitry Andric     auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4933fe013be4SDimitry Andric     auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4934fe013be4SDimitry Andric 
4935fe6060f1SDimitry Andric     return RTCostA < RTCostB;
4936fe6060f1SDimitry Andric   }
4937fe6060f1SDimitry Andric 
4938349cc55cSDimitry Andric   // Improve estimate for the vector width if it is scalable.
4939349cc55cSDimitry Andric   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4940349cc55cSDimitry Andric   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4941fe013be4SDimitry Andric   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4942349cc55cSDimitry Andric     if (A.Width.isScalable())
4943bdd1243dSDimitry Andric       EstimatedWidthA *= *VScale;
4944349cc55cSDimitry Andric     if (B.Width.isScalable())
4945bdd1243dSDimitry Andric       EstimatedWidthB *= *VScale;
4946349cc55cSDimitry Andric   }
4947349cc55cSDimitry Andric 
49480eae32dcSDimitry Andric   // Assume vscale may be larger than 1 (or the value being tuned for),
49490eae32dcSDimitry Andric   // so that scalable vectorization is slightly favorable over fixed-width
49500eae32dcSDimitry Andric   // vectorization.
4951fe6060f1SDimitry Andric   if (A.Width.isScalable() && !B.Width.isScalable())
4952349cc55cSDimitry Andric     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4953fe6060f1SDimitry Andric 
4954fe6060f1SDimitry Andric   // To avoid the need for FP division:
4955fe6060f1SDimitry Andric   //      (CostA / A.Width) < (CostB / B.Width)
4956fe6060f1SDimitry Andric   // <=>  (CostA * B.Width) < (CostB * A.Width)
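  // Worked example (hypothetical values, for illustration): CostA = 10 at an
  // estimated width of 8 versus CostB = 6 at an estimated width of 4
  // compares 10 * 4 = 40 against 6 * 8 = 48, so A (1.25 per lane) wins over
  // B (1.5 per lane).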
4957349cc55cSDimitry Andric   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4958fe6060f1SDimitry Andric }
4959fe6060f1SDimitry Andric 
4960fe013be4SDimitry Andric static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4961fe013be4SDimitry Andric                                    OptimizationRemarkEmitter *ORE,
4962fe013be4SDimitry Andric                                    Loop *TheLoop) {
4963fe013be4SDimitry Andric   if (InvalidCosts.empty())
4964fe013be4SDimitry Andric     return;
4965e8d8bef9SDimitry Andric 
4966fe6060f1SDimitry Andric   // Emit a report of VFs with invalid costs in the loop.
4967fe013be4SDimitry Andric 
4968fe6060f1SDimitry Andric   // Group the remarks per instruction, keeping the instruction order from
4969fe6060f1SDimitry Andric   // InvalidCosts.
4970fe6060f1SDimitry Andric   std::map<Instruction *, unsigned> Numbering;
4971fe6060f1SDimitry Andric   unsigned I = 0;
4972fe6060f1SDimitry Andric   for (auto &Pair : InvalidCosts)
4973fe6060f1SDimitry Andric     if (!Numbering.count(Pair.first))
4974fe6060f1SDimitry Andric       Numbering[Pair.first] = I++;
4975fe6060f1SDimitry Andric 
4976fe6060f1SDimitry Andric   // Sort the list, first on instruction(number) then on VF.
4977fe013be4SDimitry Andric   sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4978fe6060f1SDimitry Andric     if (Numbering[A.first] != Numbering[B.first])
4979fe6060f1SDimitry Andric       return Numbering[A.first] < Numbering[B.first];
4980fe6060f1SDimitry Andric     ElementCountComparator ECC;
4981fe6060f1SDimitry Andric     return ECC(A.second, B.second);
4982fe6060f1SDimitry Andric   });
4983fe6060f1SDimitry Andric 
4984fe6060f1SDimitry Andric   // For a list of ordered instruction-vf pairs:
4985fe6060f1SDimitry Andric   //   [(load, vf1), (load, vf2), (store, vf1)]
4986fe6060f1SDimitry Andric   // Group the instructions together to emit separate remarks for:
4987fe6060f1SDimitry Andric   //   load  (vf1, vf2)
4988fe6060f1SDimitry Andric   //   store (vf1)
4989fe6060f1SDimitry Andric   auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4990fe6060f1SDimitry Andric   auto Subset = ArrayRef<InstructionVFPair>();
4991fe6060f1SDimitry Andric   do {
4992fe6060f1SDimitry Andric     if (Subset.empty())
4993fe6060f1SDimitry Andric       Subset = Tail.take_front(1);
4994fe6060f1SDimitry Andric 
4995fe6060f1SDimitry Andric     Instruction *I = Subset.front().first;
4996fe6060f1SDimitry Andric 
4997fe6060f1SDimitry Andric     // If the next instruction is different, or if there are no other pairs,
4998fe6060f1SDimitry Andric     // emit a remark for the collated subset. e.g.
4999fe6060f1SDimitry Andric     //   [(load, vf1), (load, vf2))]
5000fe6060f1SDimitry Andric     // to emit:
5001fe6060f1SDimitry Andric     //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5002fe6060f1SDimitry Andric     if (Subset == Tail || Tail[Subset.size()].first != I) {
5003fe6060f1SDimitry Andric       std::string OutString;
5004fe6060f1SDimitry Andric       raw_string_ostream OS(OutString);
5005fe6060f1SDimitry Andric       assert(!Subset.empty() && "Unexpected empty range");
5006fe6060f1SDimitry Andric       OS << "Instruction with invalid costs prevented vectorization at VF=(";
5007bdd1243dSDimitry Andric       for (const auto &Pair : Subset)
5008fe013be4SDimitry Andric         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
5009fe6060f1SDimitry Andric       OS << "):";
5010fe6060f1SDimitry Andric       if (auto *CI = dyn_cast<CallInst>(I))
5011fe6060f1SDimitry Andric         OS << " call to " << CI->getCalledFunction()->getName();
5012fe6060f1SDimitry Andric       else
5013fe6060f1SDimitry Andric         OS << " " << I->getOpcodeName();
5014fe6060f1SDimitry Andric       OS.flush();
5015fe6060f1SDimitry Andric       reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5016fe6060f1SDimitry Andric       Tail = Tail.drop_front(Subset.size());
5017fe6060f1SDimitry Andric       Subset = {};
5018fe6060f1SDimitry Andric     } else
5019fe6060f1SDimitry Andric       // Grow the subset by one element
5020fe6060f1SDimitry Andric       Subset = Tail.take_front(Subset.size() + 1);
5021fe6060f1SDimitry Andric   } while (!Tail.empty());
50220b57cec5SDimitry Andric }
50230b57cec5SDimitry Andric 
5024fe013be4SDimitry Andric VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
5025fe013be4SDimitry Andric     const ElementCountSet &VFCandidates) {
5026fe013be4SDimitry Andric   InstructionCost ExpectedCost =
5027fe013be4SDimitry Andric       CM.expectedCost(ElementCount::getFixed(1)).first;
5028fe013be4SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5029fe013be4SDimitry Andric   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5030fe013be4SDimitry Andric   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5031fe013be4SDimitry Andric          "Expected Scalar VF to be a candidate");
5032fe013be4SDimitry Andric 
5033fe013be4SDimitry Andric   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5034fe013be4SDimitry Andric                                        ExpectedCost);
5035fe013be4SDimitry Andric   VectorizationFactor ChosenFactor = ScalarCost;
5036fe013be4SDimitry Andric 
5037fe013be4SDimitry Andric   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5038fe013be4SDimitry Andric   if (ForceVectorization && VFCandidates.size() > 1) {
5039fe013be4SDimitry Andric     // Ignore scalar width, because the user explicitly wants vectorization.
5040fe013be4SDimitry Andric     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5041fe013be4SDimitry Andric     // evaluation.
5042fe013be4SDimitry Andric     ChosenFactor.Cost = InstructionCost::getMax();
5043fe013be4SDimitry Andric   }
5044fe013be4SDimitry Andric 
5045fe013be4SDimitry Andric   SmallVector<InstructionVFPair> InvalidCosts;
5046fe013be4SDimitry Andric   for (const auto &i : VFCandidates) {
5047fe013be4SDimitry Andric     // The cost for scalar VF=1 is already calculated, so ignore it.
5048fe013be4SDimitry Andric     if (i.isScalar())
5049fe013be4SDimitry Andric       continue;
5050fe013be4SDimitry Andric 
5051fe013be4SDimitry Andric     LoopVectorizationCostModel::VectorizationCostTy C =
5052fe013be4SDimitry Andric         CM.expectedCost(i, &InvalidCosts);
5053fe013be4SDimitry Andric     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5054fe013be4SDimitry Andric 
5055fe013be4SDimitry Andric #ifndef NDEBUG
50566c20abcdSDimitry Andric     unsigned AssumedMinimumVscale =
50576c20abcdSDimitry Andric         getVScaleForTuning(OrigLoop, TTI).value_or(1);
5058fe013be4SDimitry Andric     unsigned Width =
5059fe013be4SDimitry Andric         Candidate.Width.isScalable()
5060fe013be4SDimitry Andric             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5061fe013be4SDimitry Andric             : Candidate.Width.getFixedValue();
5062fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5063fe013be4SDimitry Andric                       << " costs: " << (Candidate.Cost / Width));
5064fe013be4SDimitry Andric     if (i.isScalable())
5065fe013be4SDimitry Andric       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5066fe013be4SDimitry Andric                         << AssumedMinimumVscale << ")");
5067fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << ".\n");
5068fe013be4SDimitry Andric #endif
5069fe013be4SDimitry Andric 
5070fe013be4SDimitry Andric     if (!C.second && !ForceVectorization) {
5071fe013be4SDimitry Andric       LLVM_DEBUG(
5072fe013be4SDimitry Andric           dbgs() << "LV: Not considering vector loop of width " << i
5073fe013be4SDimitry Andric                  << " because it will not generate any vector instructions.\n");
5074fe013be4SDimitry Andric       continue;
5075fe013be4SDimitry Andric     }
5076fe013be4SDimitry Andric 
5077fe013be4SDimitry Andric     // If profitable, add it to the ProfitableVFs list.
5078fe013be4SDimitry Andric     if (isMoreProfitable(Candidate, ScalarCost))
5079fe013be4SDimitry Andric       ProfitableVFs.push_back(Candidate);
5080fe013be4SDimitry Andric 
5081fe013be4SDimitry Andric     if (isMoreProfitable(Candidate, ChosenFactor))
5082fe013be4SDimitry Andric       ChosenFactor = Candidate;
5083fe013be4SDimitry Andric   }
5084fe013be4SDimitry Andric 
5085fe013be4SDimitry Andric   emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5086fe013be4SDimitry Andric 
5087fe013be4SDimitry Andric   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5088fe013be4SDimitry Andric     reportVectorizationFailure(
5089fe013be4SDimitry Andric         "There are conditional stores.",
50908bcb0991SDimitry Andric         "store that is conditionally executed prevents vectorization",
5091fe013be4SDimitry Andric         "ConditionalStore", ORE, OrigLoop);
5092fe6060f1SDimitry Andric     ChosenFactor = ScalarCost;
50930b57cec5SDimitry Andric   }
50940b57cec5SDimitry Andric 
5095fe6060f1SDimitry Andric   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5096fcaf7f86SDimitry Andric                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
50970b57cec5SDimitry Andric              << "LV: Vectorization seems to be not beneficial, "
50980b57cec5SDimitry Andric              << "but was forced by a user.\n");
5099fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5100fe6060f1SDimitry Andric   return ChosenFactor;
51010b57cec5SDimitry Andric }
51020b57cec5SDimitry Andric 
5103fe013be4SDimitry Andric bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5104fe013be4SDimitry Andric     ElementCount VF) const {
5105e8d8bef9SDimitry Andric   // Cross iteration phis such as reductions need special handling and are
5106e8d8bef9SDimitry Andric   // currently unsupported.
5107fe013be4SDimitry Andric   if (any_of(OrigLoop->getHeader()->phis(),
5108bdd1243dSDimitry Andric              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5109e8d8bef9SDimitry Andric     return false;
5110e8d8bef9SDimitry Andric 
5111e8d8bef9SDimitry Andric   // Phis with uses outside of the loop require special handling and are
5112e8d8bef9SDimitry Andric   // currently unsupported.
5113bdd1243dSDimitry Andric   for (const auto &Entry : Legal->getInductionVars()) {
5114e8d8bef9SDimitry Andric     // Look for uses of the value of the induction at the last iteration.
5115fe013be4SDimitry Andric     Value *PostInc =
5116fe013be4SDimitry Andric         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5117e8d8bef9SDimitry Andric     for (User *U : PostInc->users())
5118fe013be4SDimitry Andric       if (!OrigLoop->contains(cast<Instruction>(U)))
5119e8d8bef9SDimitry Andric         return false;
5120e8d8bef9SDimitry Andric     // Look for uses of penultimate value of the induction.
5121e8d8bef9SDimitry Andric     for (User *U : Entry.first->users())
5122fe013be4SDimitry Andric       if (!OrigLoop->contains(cast<Instruction>(U)))
5123e8d8bef9SDimitry Andric         return false;
5124e8d8bef9SDimitry Andric   }
5125e8d8bef9SDimitry Andric 
5126fe6060f1SDimitry Andric   // Epilogue vectorization code has not been audited to ensure it handles
5127fe6060f1SDimitry Andric   // non-latch exits properly. It may be fine, but it needs to be audited and
5128fe6060f1SDimitry Andric   // tested.
5129fe013be4SDimitry Andric   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5130fe6060f1SDimitry Andric     return false;
5131fe6060f1SDimitry Andric 
5132e8d8bef9SDimitry Andric   return true;
5133e8d8bef9SDimitry Andric }
5134e8d8bef9SDimitry Andric 
5135e8d8bef9SDimitry Andric bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5136e8d8bef9SDimitry Andric     const ElementCount VF) const {
5137e8d8bef9SDimitry Andric   // FIXME: We need a much better cost-model to take different parameters such
5138e8d8bef9SDimitry Andric   // as register pressure, code size increase and cost of extra branches into
5139e8d8bef9SDimitry Andric   // account. For now we apply a very crude heuristic and only consider loops
5140e8d8bef9SDimitry Andric   // with vectorization factors larger than a certain value.
5141bdd1243dSDimitry Andric 
5142bdd1243dSDimitry Andric   // Allow the target to opt out entirely.
5143bdd1243dSDimitry Andric   if (!TTI.preferEpilogueVectorization())
5144bdd1243dSDimitry Andric     return false;
5145bdd1243dSDimitry Andric 
5146e8d8bef9SDimitry Andric   // We also consider epilogue vectorization unprofitable for targets that don't
5147e8d8bef9SDimitry Andric   // consider interleaving beneficial (e.g. MVE).
5148fe013be4SDimitry Andric   if (TTI.getMaxInterleaveFactor(VF) <= 1)
5149e8d8bef9SDimitry Andric     return false;
5150fe013be4SDimitry Andric 
5151fe013be4SDimitry Andric   unsigned Multiplier = 1;
5152fe013be4SDimitry Andric   if (VF.isScalable())
5153fe013be4SDimitry Andric     Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
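  // Worked example (hypothetical values, for illustration): a main VF of
  // vscale x 4 with a tuning vscale of 4 yields 4 * 4 = 16 estimated lanes,
  // which passes an EpilogueVectorizationMinVF threshold of 16.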
5154fe013be4SDimitry Andric   if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5155e8d8bef9SDimitry Andric     return true;
5156e8d8bef9SDimitry Andric   return false;
5157e8d8bef9SDimitry Andric }
5158e8d8bef9SDimitry Andric 
5159fe013be4SDimitry Andric VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5160fe013be4SDimitry Andric     const ElementCount MainLoopVF, unsigned IC) {
5161e8d8bef9SDimitry Andric   VectorizationFactor Result = VectorizationFactor::Disabled();
5162e8d8bef9SDimitry Andric   if (!EnableEpilogueVectorization) {
5163fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5164e8d8bef9SDimitry Andric     return Result;
5165e8d8bef9SDimitry Andric   }
5166e8d8bef9SDimitry Andric 
5167fe013be4SDimitry Andric   if (!CM.isScalarEpilogueAllowed()) {
5168fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5169fe013be4SDimitry Andric                          "epilogue is allowed.\n");
5170e8d8bef9SDimitry Andric     return Result;
5171e8d8bef9SDimitry Andric   }
5172e8d8bef9SDimitry Andric 
5173e8d8bef9SDimitry Andric   // Not really a cost consideration, but check for unsupported cases here to
5174e8d8bef9SDimitry Andric   // simplify the logic.
5175fe013be4SDimitry Andric   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5176fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5177fe013be4SDimitry Andric                          "is not a supported candidate.\n");
5178e8d8bef9SDimitry Andric     return Result;
5179e8d8bef9SDimitry Andric   }
5180e8d8bef9SDimitry Andric 
5181e8d8bef9SDimitry Andric   if (EpilogueVectorizationForceVF > 1) {
5182fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5183349cc55cSDimitry Andric     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5184fe013be4SDimitry Andric     if (hasPlanWithVF(ForcedEC))
518581ad6265SDimitry Andric       return {ForcedEC, 0, 0};
5186e8d8bef9SDimitry Andric     else {
5187fe013be4SDimitry Andric       LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5188fe013be4SDimitry Andric                            "viable.\n");
5189e8d8bef9SDimitry Andric       return Result;
5190e8d8bef9SDimitry Andric     }
5191e8d8bef9SDimitry Andric   }
5192e8d8bef9SDimitry Andric 
5193fe013be4SDimitry Andric   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5194fe013be4SDimitry Andric       OrigLoop->getHeader()->getParent()->hasMinSize()) {
5195e8d8bef9SDimitry Andric     LLVM_DEBUG(
5196fe013be4SDimitry Andric         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5197e8d8bef9SDimitry Andric     return Result;
5198e8d8bef9SDimitry Andric   }
5199e8d8bef9SDimitry Andric 
5200fe013be4SDimitry Andric   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5201349cc55cSDimitry Andric     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5202349cc55cSDimitry Andric                          "this loop\n");
5203e8d8bef9SDimitry Andric     return Result;
5204349cc55cSDimitry Andric   }
5205e8d8bef9SDimitry Andric 
5206d56accc7SDimitry Andric   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5207d56accc7SDimitry Andric   // the main loop handles 8 lanes per iteration. We could still benefit from
5208d56accc7SDimitry Andric   // vectorizing the epilogue loop with VF=4.
5209d56accc7SDimitry Andric   ElementCount EstimatedRuntimeVF = MainLoopVF;
5210d56accc7SDimitry Andric   if (MainLoopVF.isScalable()) {
5211d56accc7SDimitry Andric     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5212fe013be4SDimitry Andric     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
521381ad6265SDimitry Andric       EstimatedRuntimeVF *= *VScale;
5214d56accc7SDimitry Andric   }
5215d56accc7SDimitry Andric 
5216fe013be4SDimitry Andric   ScalarEvolution &SE = *PSE.getSE();
5217fe013be4SDimitry Andric   Type *TCType = Legal->getWidestInductionType();
5218fe013be4SDimitry Andric   const SCEV *RemainingIterations = nullptr;
5219fe013be4SDimitry Andric   for (auto &NextVF : ProfitableVFs) {
5220fe013be4SDimitry Andric     // Skip candidate VFs without a corresponding VPlan.
5221fe013be4SDimitry Andric     if (!hasPlanWithVF(NextVF.Width))
5222fe013be4SDimitry Andric       continue;
5223fe013be4SDimitry Andric 
5224fe013be4SDimitry Andric     // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5225fe013be4SDimitry Andric     // vectors) or the VF of the main loop (fixed vectors).
5226fe013be4SDimitry Andric     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5227fe013be4SDimitry Andric          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5228fe013be4SDimitry Andric         ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5229fe013be4SDimitry Andric       continue;
5230fe013be4SDimitry Andric 
5231fe013be4SDimitry Andric     // If NextVF is greater than the number of remaining iterations, the
5232fe013be4SDimitry Andric     // epilogue loop would be dead. Skip such factors.
5233fe013be4SDimitry Andric     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5234fe013be4SDimitry Andric       // TODO: extend to support scalable VFs.
5235fe013be4SDimitry Andric       if (!RemainingIterations) {
5236fe013be4SDimitry Andric         const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5237fe013be4SDimitry Andric         RemainingIterations = SE.getURemExpr(
5238fe013be4SDimitry Andric             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5239fe013be4SDimitry Andric       }
5240fe013be4SDimitry Andric       if (SE.isKnownPredicate(
5241fe013be4SDimitry Andric               CmpInst::ICMP_UGT,
5242fe013be4SDimitry Andric               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5243fe013be4SDimitry Andric               RemainingIterations))
5244fe013be4SDimitry Andric         continue;
5245fe013be4SDimitry Andric     }
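    // Worked example (hypothetical values, for illustration): with a trip
    // count of 100, MainLoopVF = 8 and IC = 2, the main loop leaves
    // 100 % 16 = 4 remaining iterations, so any epilogue candidate wider
    // than 4 lanes is skipped as dead.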
5246fe013be4SDimitry Andric 
5247fe013be4SDimitry Andric     if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5248e8d8bef9SDimitry Andric       Result = NextVF;
5249fe013be4SDimitry Andric   }
5250e8d8bef9SDimitry Andric 
5251e8d8bef9SDimitry Andric   if (Result != VectorizationFactor::Disabled())
5252e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5253fe013be4SDimitry Andric                       << Result.Width << "\n");
5254e8d8bef9SDimitry Andric   return Result;
5255e8d8bef9SDimitry Andric }
5256e8d8bef9SDimitry Andric 
52570b57cec5SDimitry Andric std::pair<unsigned, unsigned>
52580b57cec5SDimitry Andric LoopVectorizationCostModel::getSmallestAndWidestTypes() {
52590b57cec5SDimitry Andric   unsigned MinWidth = -1U;
52600b57cec5SDimitry Andric   unsigned MaxWidth = 8;
52610b57cec5SDimitry Andric   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
526204eeddc0SDimitry Andric   // For in-loop reductions, no element types are added to ElementTypesInLoop
526304eeddc0SDimitry Andric   // if there are no loads/stores in the loop. In this case, check through the
526404eeddc0SDimitry Andric   // reduction variables to determine the maximum width.
526504eeddc0SDimitry Andric   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
526604eeddc0SDimitry Andric     // Reset MaxWidth so that we can find the smallest type used by recurrences
526704eeddc0SDimitry Andric     // in the loop.
526804eeddc0SDimitry Andric     MaxWidth = -1U;
5269bdd1243dSDimitry Andric     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
527004eeddc0SDimitry Andric       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
527104eeddc0SDimitry Andric       // When finding the min width used by the recurrence we need to account
527204eeddc0SDimitry Andric       // for casts on the input operands of the recurrence.
527304eeddc0SDimitry Andric       MaxWidth = std::min<unsigned>(
527404eeddc0SDimitry Andric           MaxWidth, std::min<unsigned>(
527504eeddc0SDimitry Andric                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
527604eeddc0SDimitry Andric                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
527704eeddc0SDimitry Andric     }
527804eeddc0SDimitry Andric   } else {
5279fe6060f1SDimitry Andric     for (Type *T : ElementTypesInLoop) {
5280fe6060f1SDimitry Andric       MinWidth = std::min<unsigned>(
5281bdd1243dSDimitry Andric           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5282fe6060f1SDimitry Andric       MaxWidth = std::max<unsigned>(
5283bdd1243dSDimitry Andric           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5284fe6060f1SDimitry Andric     }
528504eeddc0SDimitry Andric   }
5286fe6060f1SDimitry Andric   return {MinWidth, MaxWidth};
5287fe6060f1SDimitry Andric }
52880b57cec5SDimitry Andric 
5289fe6060f1SDimitry Andric void LoopVectorizationCostModel::collectElementTypesForWidening() {
5290fe6060f1SDimitry Andric   ElementTypesInLoop.clear();
52910b57cec5SDimitry Andric   // For each block.
52920b57cec5SDimitry Andric   for (BasicBlock *BB : TheLoop->blocks()) {
52930b57cec5SDimitry Andric     // For each instruction in the loop.
52940b57cec5SDimitry Andric     for (Instruction &I : BB->instructionsWithoutDebug()) {
52950b57cec5SDimitry Andric       Type *T = I.getType();
52960b57cec5SDimitry Andric 
52970b57cec5SDimitry Andric       // Skip ignored values.
52985ffd83dbSDimitry Andric       if (ValuesToIgnore.count(&I))
52990b57cec5SDimitry Andric         continue;
53000b57cec5SDimitry Andric 
53010b57cec5SDimitry Andric       // Only examine Loads, Stores and PHINodes.
53020b57cec5SDimitry Andric       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
53030b57cec5SDimitry Andric         continue;
53040b57cec5SDimitry Andric 
53050b57cec5SDimitry Andric       // Examine PHI nodes that are reduction variables. Update the type to
53060b57cec5SDimitry Andric       // account for the recurrence type.
53070b57cec5SDimitry Andric       if (auto *PN = dyn_cast<PHINode>(&I)) {
53080b57cec5SDimitry Andric         if (!Legal->isReductionVariable(PN))
53090b57cec5SDimitry Andric           continue;
53100eae32dcSDimitry Andric         const RecurrenceDescriptor &RdxDesc =
53110eae32dcSDimitry Andric             Legal->getReductionVars().find(PN)->second;
5312fe6060f1SDimitry Andric         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5313e8d8bef9SDimitry Andric             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5314e8d8bef9SDimitry Andric                                       RdxDesc.getRecurrenceType(),
5315e8d8bef9SDimitry Andric                                       TargetTransformInfo::ReductionFlags()))
5316e8d8bef9SDimitry Andric           continue;
53170b57cec5SDimitry Andric         T = RdxDesc.getRecurrenceType();
53180b57cec5SDimitry Andric       }
53190b57cec5SDimitry Andric 
53200b57cec5SDimitry Andric       // Examine the stored values.
53210b57cec5SDimitry Andric       if (auto *ST = dyn_cast<StoreInst>(&I))
53220b57cec5SDimitry Andric         T = ST->getValueOperand()->getType();
53230b57cec5SDimitry Andric 
532404eeddc0SDimitry Andric       assert(T->isSized() &&
532504eeddc0SDimitry Andric              "Expected the load/store/recurrence type to be sized");
53260b57cec5SDimitry Andric 
5327fe6060f1SDimitry Andric       ElementTypesInLoop.insert(T);
53280b57cec5SDimitry Andric     }
53290b57cec5SDimitry Andric   }
53300b57cec5SDimitry Andric }
53310b57cec5SDimitry Andric 
5332bdd1243dSDimitry Andric unsigned
5333bdd1243dSDimitry Andric LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5334bdd1243dSDimitry Andric                                                   InstructionCost LoopCost) {
53350b57cec5SDimitry Andric   // -- The interleave heuristics --
53360b57cec5SDimitry Andric   // We interleave the loop in order to expose ILP and reduce the loop overhead.
53370b57cec5SDimitry Andric   // There are many micro-architectural considerations that we can't predict
53380b57cec5SDimitry Andric   // at this level. For example, frontend pressure (on decode or fetch) due to
53390b57cec5SDimitry Andric   // code size, or the number and capabilities of the execution ports.
53400b57cec5SDimitry Andric   //
53410b57cec5SDimitry Andric   // We use the following heuristics to select the interleave count:
53420b57cec5SDimitry Andric   // 1. If the code has reductions, then we interleave to break the cross
53430b57cec5SDimitry Andric   // iteration dependency.
53440b57cec5SDimitry Andric   // 2. If the loop is really small, then we interleave to reduce the loop
53450b57cec5SDimitry Andric   // overhead.
53460b57cec5SDimitry Andric   // 3. We don't interleave if we think that we will spill registers to memory
53470b57cec5SDimitry Andric   // due to the increased register pressure.
53480b57cec5SDimitry Andric 
53498bcb0991SDimitry Andric   if (!isScalarEpilogueAllowed())
53500b57cec5SDimitry Andric     return 1;
53510b57cec5SDimitry Andric 
53520b57cec5SDimitry Andric   // The dependence distance bound also limits VF * IC; do not interleave.
5353fe013be4SDimitry Andric   if (!Legal->isSafeForAnyVectorWidth())
53540b57cec5SDimitry Andric     return 1;
53550b57cec5SDimitry Andric 
5356480093f4SDimitry Andric   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5357e8d8bef9SDimitry Andric   const bool HasReductions = !Legal->getReductionVars().empty();
5358e8d8bef9SDimitry Andric   // Do not interleave loops with a relatively small known or estimated trip
5359e8d8bef9SDimitry Andric   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5360e8d8bef9SDimitry Andric   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5361e8d8bef9SDimitry Andric   // because with the above conditions interleaving can expose ILP and break
5362e8d8bef9SDimitry Andric   // cross iteration dependences for reductions.
5363e8d8bef9SDimitry Andric   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5364e8d8bef9SDimitry Andric       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
53650b57cec5SDimitry Andric     return 1;
53660b57cec5SDimitry Andric 
53673a9a9c0cSDimitry Andric   // If we did not calculate the cost for VF (because the user selected the VF)
53683a9a9c0cSDimitry Andric   // then we calculate the cost of VF here.
53693a9a9c0cSDimitry Andric   if (LoopCost == 0) {
5370bdd1243dSDimitry Andric     LoopCost = expectedCost(VF).first;
5371bdd1243dSDimitry Andric     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
53723a9a9c0cSDimitry Andric 
53733a9a9c0cSDimitry Andric     // Loop body is free and there is no need for interleaving.
53743a9a9c0cSDimitry Andric     if (LoopCost == 0)
53753a9a9c0cSDimitry Andric       return 1;
53763a9a9c0cSDimitry Andric   }
53773a9a9c0cSDimitry Andric 
53780b57cec5SDimitry Andric   RegisterUsage R = calculateRegisterUsage({VF})[0];
53790b57cec5SDimitry Andric   // We divide by these constants so assume that we have at least one
53800b57cec5SDimitry Andric   // instruction that uses at least one register.
53818bcb0991SDimitry Andric   for (auto& pair : R.MaxLocalUsers) {
53828bcb0991SDimitry Andric     pair.second = std::max(pair.second, 1U);
53838bcb0991SDimitry Andric   }
53840b57cec5SDimitry Andric 
53850b57cec5SDimitry Andric   // We calculate the interleave count using the following formula.
53860b57cec5SDimitry Andric   // Subtract the number of loop invariants from the number of available
53870b57cec5SDimitry Andric   // registers. These registers are used by all of the interleaved instances.
53880b57cec5SDimitry Andric   // Next, divide the remaining registers by the number of registers that is
53890b57cec5SDimitry Andric   // required by the loop, in order to estimate how many parallel instances
53900b57cec5SDimitry Andric   // fit without causing spills. All of this is rounded down if necessary to be
53910b57cec5SDimitry Andric   // a power of two. We want power of two interleave count to simplify any
53920b57cec5SDimitry Andric   // addressing operations or alignment considerations.
53930b57cec5SDimitry Andric   // We also want power of two interleave counts to ensure that the induction
53940b57cec5SDimitry Andric   // variable of the vector loop wraps to zero, when tail is folded by masking;
53950b57cec5SDimitry Andric   // this currently happens when OptForSize, in which case IC is set to 1 above.
53968bcb0991SDimitry Andric   unsigned IC = UINT_MAX;
53970b57cec5SDimitry Andric 
53988bcb0991SDimitry Andric   for (auto& pair : R.MaxLocalUsers) {
53998bcb0991SDimitry Andric     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
54008bcb0991SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
54018bcb0991SDimitry Andric                       << " registers of "
54028bcb0991SDimitry Andric                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5403e8d8bef9SDimitry Andric     if (VF.isScalar()) {
54048bcb0991SDimitry Andric       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
54058bcb0991SDimitry Andric         TargetNumRegisters = ForceTargetNumScalarRegs;
54068bcb0991SDimitry Andric     } else {
54078bcb0991SDimitry Andric       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
54088bcb0991SDimitry Andric         TargetNumRegisters = ForceTargetNumVectorRegs;
54098bcb0991SDimitry Andric     }
54108bcb0991SDimitry Andric     unsigned MaxLocalUsers = pair.second;
54118bcb0991SDimitry Andric     unsigned LoopInvariantRegs = 0;
54128bcb0991SDimitry Andric     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
54138bcb0991SDimitry Andric       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
54148bcb0991SDimitry Andric 
5415fe013be4SDimitry Andric     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5416fe013be4SDimitry Andric                                      MaxLocalUsers);
54170b57cec5SDimitry Andric     // Don't count the induction variable as interleaved.
54188bcb0991SDimitry Andric     if (EnableIndVarRegisterHeur) {
5419fe013be4SDimitry Andric       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
54208bcb0991SDimitry Andric                               std::max(1U, (MaxLocalUsers - 1)));
54218bcb0991SDimitry Andric     }
54228bcb0991SDimitry Andric 
54238bcb0991SDimitry Andric     IC = std::min(IC, TmpIC);
54248bcb0991SDimitry Andric   }
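  // Worked example (hypothetical values, for illustration): a class with 32
  // registers, 4 loop-invariant values and 7 max local users yields
  // bit_floor((32 - 4) / 7) = 4, capping IC at 4 for that register class.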
54250b57cec5SDimitry Andric 
54260b57cec5SDimitry Andric   // Clamp the interleave ranges to reasonable counts.
5427fe013be4SDimitry Andric   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
54280b57cec5SDimitry Andric 
54290b57cec5SDimitry Andric   // Check if the user has overridden the max.
5430e8d8bef9SDimitry Andric   if (VF.isScalar()) {
54310b57cec5SDimitry Andric     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
54320b57cec5SDimitry Andric       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
54330b57cec5SDimitry Andric   } else {
54340b57cec5SDimitry Andric     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
54350b57cec5SDimitry Andric       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
54360b57cec5SDimitry Andric   }
54370b57cec5SDimitry Andric 
5438cdc20ff6SDimitry Andric   unsigned EstimatedVF = VF.getKnownMinValue();
5439cdc20ff6SDimitry Andric   if (VF.isScalable()) {
5440cdc20ff6SDimitry Andric     if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5441cdc20ff6SDimitry Andric       EstimatedVF *= *VScale;
5442cdc20ff6SDimitry Andric   }
5443cdc20ff6SDimitry Andric   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5444cdc20ff6SDimitry Andric 
5445cdc20ff6SDimitry Andric   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5446cdc20ff6SDimitry Andric   if (KnownTC) {
5447cdc20ff6SDimitry Andric     // If the trip count is known, we select between two prospective ICs, where
5448cdc20ff6SDimitry Andric     // 1) the aggressive IC is capped by the trip count divided by VF
5449cdc20ff6SDimitry Andric     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5450cdc20ff6SDimitry Andric     // The final IC is selected in a way that the epilogue loop trip count is
5451cdc20ff6SDimitry Andric     // minimized while maximizing the IC itself, so that we either run the
5452cdc20ff6SDimitry Andric     // vector loop at least once if it generates a small epilogue loop, or else
5453cdc20ff6SDimitry Andric     // we run the vector loop at least twice.
5454cdc20ff6SDimitry Andric 
5455cdc20ff6SDimitry Andric     unsigned InterleaveCountUB = bit_floor(
5456cdc20ff6SDimitry Andric         std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5457cdc20ff6SDimitry Andric     unsigned InterleaveCountLB = bit_floor(std::max(
5458cdc20ff6SDimitry Andric         1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5459cdc20ff6SDimitry Andric     MaxInterleaveCount = InterleaveCountLB;
5460cdc20ff6SDimitry Andric 
5461cdc20ff6SDimitry Andric     if (InterleaveCountUB != InterleaveCountLB) {
5462cdc20ff6SDimitry Andric       unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5463cdc20ff6SDimitry Andric       unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5464cdc20ff6SDimitry Andric       // If both produce the same scalar tail, maximize the IC to do the same
5465cdc20ff6SDimitry Andric       // work in fewer vector loop iterations.
5466cdc20ff6SDimitry Andric       if (TailTripCountUB == TailTripCountLB)
5467cdc20ff6SDimitry Andric         MaxInterleaveCount = InterleaveCountUB;
5468cdc20ff6SDimitry Andric     }
5469cdc20ff6SDimitry Andric   } else if (BestKnownTC) {
5470cdc20ff6SDimitry Andric     // If trip count is an estimated compile time constant, limit the
5471cdc20ff6SDimitry Andric     // IC to be capped by the trip count divided by VF * 2, such that the vector
5472cdc20ff6SDimitry Andric     // loop runs at least twice to make interleaving seem profitable when there
5473cdc20ff6SDimitry Andric     // is an epilogue loop present. Since the exact trip count is not known we
5474cdc20ff6SDimitry Andric     // choose to be conservative in our IC estimate.
5475cdc20ff6SDimitry Andric     MaxInterleaveCount = bit_floor(std::max(
5476cdc20ff6SDimitry Andric         1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
54778bcb0991SDimitry Andric   }
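  // Worked example (hypothetical values, for illustration): KnownTC = 48,
  // EstimatedVF = 8 and a target maximum of 4 give
  // InterleaveCountUB = bit_floor(min(6, 4)) = 4 and
  // InterleaveCountLB = bit_floor(min(3, 4)) = 2; the scalar tails differ
  // (48 % 32 = 16 vs. 48 % 16 = 0), so the conservative bound of 2 is kept.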
54788bcb0991SDimitry Andric 
5479e8d8bef9SDimitry Andric   assert(MaxInterleaveCount > 0 &&
5480e8d8bef9SDimitry Andric          "Maximum interleave count must be greater than 0");
54810b57cec5SDimitry Andric 
54820b57cec5SDimitry Andric   // Clamp the calculated IC to be between the 1 and the max interleave count
54838bcb0991SDimitry Andric   // that the target and trip count allows.
54840b57cec5SDimitry Andric   if (IC > MaxInterleaveCount)
54850b57cec5SDimitry Andric     IC = MaxInterleaveCount;
5486e8d8bef9SDimitry Andric   else
5487e8d8bef9SDimitry Andric     // Make sure IC is greater than 0.
5488e8d8bef9SDimitry Andric     IC = std::max(1u, IC);
5489e8d8bef9SDimitry Andric 
5490e8d8bef9SDimitry Andric   assert(IC > 0 && "Interleave count must be greater than 0.");
5491e8d8bef9SDimitry Andric 
54920b57cec5SDimitry Andric   // Interleave if we vectorized this loop and there is a reduction that could
54930b57cec5SDimitry Andric   // benefit from interleaving.
5494e8d8bef9SDimitry Andric   if (VF.isVector() && HasReductions) {
54950b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
54960b57cec5SDimitry Andric     return IC;
54970b57cec5SDimitry Andric   }
54980b57cec5SDimitry Andric 
549981ad6265SDimitry Andric   // For any scalar loop that either requires runtime checks or predication we
550081ad6265SDimitry Andric   // are better off leaving this to the unroller. Note that if we've already
550181ad6265SDimitry Andric   // vectorized the loop we will have done the runtime check and so interleaving
550281ad6265SDimitry Andric   // won't require further checks.
550381ad6265SDimitry Andric   bool ScalarInterleavingRequiresPredication =
550481ad6265SDimitry Andric       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
550581ad6265SDimitry Andric          return Legal->blockNeedsPredication(BB);
550681ad6265SDimitry Andric        }));
550781ad6265SDimitry Andric   bool ScalarInterleavingRequiresRuntimePointerCheck =
5508e8d8bef9SDimitry Andric       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
55090b57cec5SDimitry Andric 
55100b57cec5SDimitry Andric   // We want to interleave small loops in order to reduce the loop overhead and
55110b57cec5SDimitry Andric   // potentially expose ILP opportunities.
5512e8d8bef9SDimitry Andric   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5513e8d8bef9SDimitry Andric                     << "LV: IC is " << IC << '\n'
5514e8d8bef9SDimitry Andric                     << "LV: VF is " << VF << '\n');
5515e8d8bef9SDimitry Andric   const bool AggressivelyInterleaveReductions =
5516e8d8bef9SDimitry Andric       TTI.enableAggressiveInterleaving(HasReductions);
551781ad6265SDimitry Andric   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
551881ad6265SDimitry Andric       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
55190b57cec5SDimitry Andric     // We assume that the cost overhead is 1 and we use the cost model
55200b57cec5SDimitry Andric     // to estimate the cost of the loop and interleave until the cost of the
55210b57cec5SDimitry Andric     // loop overhead is about 5% of the cost of the loop.
5522fe013be4SDimitry Andric     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5523fe013be4SDimitry Andric                                         SmallLoopCost / *LoopCost.getValue()));
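    // Worked example (hypothetical values, for illustration): with
    // LoopCost = 4 and SmallLoopCost at its default of 20,
    // bit_floor(20 / 4) = 4, so SmallIC = min(IC, 4).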

    // Interleave until store/load ports (estimated by max interleave count)
    // are saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
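    // For example, with IC = 8, 2 stores, and 4 loads, StoresIC = 4 and
    // LoadsIC = 2; interleaving beyond max(StoresIC, LoadsIC) is assumed to
    // oversubscribe the store/load ports.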

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isAnyOfRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC, but not as aggressively as the normal
      // IC, to handle the rare situation where resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi map that holds the list of
  // intervals that *end* at a specific location. This multi map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take more registers.
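  // For example, given %a = load; %b = add %a, 1; %c = mul %b, %b in a row,
  // %a's interval ends where %b is defined and %b's where %c is defined; the
  // estimate is the maximum number of intervals open at any point.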
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-instruction values such as arguments and
  // constants).
  SmallSetVector<Instruction *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If,
        // for example, an argument is used inside the loop, it will increase
        // the register pressure (so shouldn't we add it to LoopInvariants?).
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);
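  // For example, if EndPoint is {%a -> 3, %b -> 3, %c -> 5}, TransposeEnds
  // becomes {3 -> [%a, %b], 5 -> [%c]}, letting the scan below close every
  // interval that ends at a given index with a single lookup.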

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // Returns the number of registers needed to hold a value of type Ty widened
  // to VF lanes, or zero for types that cannot be vectorized.
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0;
    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    collectInLoopReductions();

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of registers used, per register class, given all open
      // intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
      // there is no previous entry for ClassID.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            // FIXME: The target might use more than one register for the type
            // even in the scalar case.
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto &pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *Inst : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      bool IsScalar = all_of(Inst->users(), [&](User *U) {
        auto *I = cast<Instruction>(U);
        return TheLoop != LI->getLoopFor(I->getParent()) ||
               isScalarAfterVectorization(I, VFs[i]);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
      unsigned ClassID =
          TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
                                                           ElementCount VF) {
  // TODO: The cost model for emulated masked load/store is completely broken.
  // This hack guides the cost model to use an artificially high enough value
  // to practically disable vectorization with such operations, except where
  // the previously deployed legality hack allowed using very low cost values.
  // This is to avoid regressions coming simply from moving the "masked
  // load/store" check from legality to the cost model. Masked Load/Gather
  // emulation was previously never allowed; a limited number of Masked
  // Store/Scatter emulations were allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  PredicatedBBsAfterVectorization[VF].clear();

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply the discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply the discount logic if a hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization[VF].insert(BB);
      }
  }
}

InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(I))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
          /*Extract*/ false, CostKind);
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
              /*Extract*/ true, CostKind);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();
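    // For example, with a reciprocal block probability of 2 (the predicated
    // block is assumed to execute on half of the iterations), an accumulated
    // scalar cost of 10 becomes 5 before it is compared to the vector cost.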

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(
    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(&I) ||
          (VF.isVector() && VecValuesToIgnore.count(&I)))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (C.first.isValid() &&
          ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = InstructionCost(ForceTargetInstructionCost);

      // Keep a list of instructions with invalid costs.
      if (Invalid && !C.first.isValid())
        Invalid->emplace_back(&I, VF);

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail-folded loops.
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();
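    // For example, a predicated scalar block whose instructions sum to a cost
    // of 6 contributes only 3, since the block is assumed to execute on
    // roughly half of the iterations.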

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}

/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {

  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a GEP with all loop-invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}
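
// For example, for a GEP computing &a[i][j] where i is the loop's induction
// variable and j is loop invariant, the check above passes and the pointer's
// SCEV (an add recurrence over the loop) is returned; a non-GEP pointer,
// such as one reloaded from memory every iteration, yields nullptr.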

InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided, and get the stride value
  // if it's known at compile time.
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                      ValTy->getScalarType(),
                                                      Alignment, AS, CostKind);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF, CostKind);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch.
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially set to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}

InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  InstructionCost Cost = 0;
  if (Legal->isMaskRequired(I)) {
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  } else {
    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, OpInfo, I);
  }
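  // For example, the loads in "for (i = n - 1; i >= 0; --i) sum += a[i];"
  // are consecutive with stride -1: they are costed as a forward wide load
  // plus the reverse shuffle added below.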
  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                               std::nullopt, CostKind, 0);
  return Cost;
}

InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {
  assert(Legal->isUniformMemOp(*I, VF));

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    // A uniform load is modeled as a single scalar load plus a broadcast of
    // the loaded value to all lanes.
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  // A uniform store is modeled as a single scalar store; if the stored value
  // is not loop invariant, add the cost of extracting the last lane.
  bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                             CostKind) +
         (isLoopInvariantStoreValue
              ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       CostKind, VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);
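  // For example, a factor-4 group where members 0, 1, and 3 are present
  // (member 2 is a gap) yields Indices = {0, 1, 3}; stores with such gaps
  // may need masking, as computed below.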

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                               std::nullopt, CostKind, 0);
  }
  return Cost;
}

std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty,
    TTI::TargetCostKind CostKind) const {
  using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, taking the minimal
  // acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we
  // find the pattern of mul/ext and test the cost of the entire pattern vs
  // the cost of the components. If the reduction cost is lower then we return
  // it for the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return an invalid cost specifying the original
  // cost method should be used.
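  // For example, on targets with a fused multiply-accumulate reduction,
  // reduce.add(mul(sext(A), sext(B))) may be costed as a single operation,
  // cheaper than pricing the extends, the multiply, and the reduction
  // separately.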
6249e8d8bef9SDimitry Andric   Instruction *RetI = I;
6250fe6060f1SDimitry Andric   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6251e8d8bef9SDimitry Andric     if (!RetI->hasOneUser())
6252bdd1243dSDimitry Andric       return std::nullopt;
6253e8d8bef9SDimitry Andric     RetI = RetI->user_back();
6254e8d8bef9SDimitry Andric   }
6255bdd1243dSDimitry Andric 
6256bdd1243dSDimitry Andric   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6257e8d8bef9SDimitry Andric       RetI->user_back()->getOpcode() == Instruction::Add) {
6258e8d8bef9SDimitry Andric     RetI = RetI->user_back();
6259e8d8bef9SDimitry Andric   }
6260e8d8bef9SDimitry Andric 
6261e8d8bef9SDimitry Andric   // Test if the found instruction is a reduction, and if not return an invalid
6262e8d8bef9SDimitry Andric   // cost specifying the parent to use the original cost modelling.
6263e8d8bef9SDimitry Andric   if (!InLoopReductionImmediateChains.count(RetI))
6264bdd1243dSDimitry Andric     return std::nullopt;
6265e8d8bef9SDimitry Andric 
6266e8d8bef9SDimitry Andric   // Find the reduction this chain is a part of and calculate the basic cost of
6267e8d8bef9SDimitry Andric   // the reduction on its own.
6268c9157d92SDimitry Andric   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6269e8d8bef9SDimitry Andric   Instruction *ReductionPhi = LastChain;
6270e8d8bef9SDimitry Andric   while (!isa<PHINode>(ReductionPhi))
6271c9157d92SDimitry Andric     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6272e8d8bef9SDimitry Andric 
6273fe6060f1SDimitry Andric   const RecurrenceDescriptor &RdxDesc =
62740eae32dcSDimitry Andric       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6275fe6060f1SDimitry Andric 
6276fe6060f1SDimitry Andric   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6277fe6060f1SDimitry Andric       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6278fe6060f1SDimitry Andric 
62794824e7fdSDimitry Andric   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
62804824e7fdSDimitry Andric   // normal fmul instruction to the cost of the fadd reduction.
62814824e7fdSDimitry Andric   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
62824824e7fdSDimitry Andric     BaseCost +=
62834824e7fdSDimitry Andric         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
62844824e7fdSDimitry Andric 
6285fe6060f1SDimitry Andric   // If we're using ordered reductions then we can just return the base cost
6286fe6060f1SDimitry Andric   // here, since getArithmeticReductionCost calculates the full ordered
6287fe6060f1SDimitry Andric   // reduction cost when FP reassociation is not allowed.
6288fe6060f1SDimitry Andric   if (useOrderedReductions(RdxDesc))
6289fe6060f1SDimitry Andric     return BaseCost;
6290e8d8bef9SDimitry Andric 
6291e8d8bef9SDimitry Andric   // Get the operand that was not the reduction chain and match it to one of the
6292e8d8bef9SDimitry Andric   // patterns, returning the better cost if it is found.
6293e8d8bef9SDimitry Andric   Instruction *RedOp = RetI->getOperand(1) == LastChain
6294e8d8bef9SDimitry Andric                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6295e8d8bef9SDimitry Andric                            : dyn_cast<Instruction>(RetI->getOperand(1));
6296e8d8bef9SDimitry Andric 
6297e8d8bef9SDimitry Andric   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6298e8d8bef9SDimitry Andric 
6299fe6060f1SDimitry Andric   Instruction *Op0, *Op1;
6300bdd1243dSDimitry Andric   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6301349cc55cSDimitry Andric       match(RedOp,
6302349cc55cSDimitry Andric             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6303349cc55cSDimitry Andric       match(Op0, m_ZExtOrSExt(m_Value())) &&
6304349cc55cSDimitry Andric       Op0->getOpcode() == Op1->getOpcode() &&
6305349cc55cSDimitry Andric       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6306349cc55cSDimitry Andric       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6307349cc55cSDimitry Andric       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6308349cc55cSDimitry Andric 
6309bdd1243dSDimitry Andric     // Matched reduce.add(ext(mul(ext(A), ext(B)))
6310349cc55cSDimitry Andric     // Note that the extend opcodes need to all match, or if A==B they will have
6311349cc55cSDimitry Andric     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6312349cc55cSDimitry Andric     // which is equally fine.
6313349cc55cSDimitry Andric     bool IsUnsigned = isa<ZExtInst>(Op0);
6314349cc55cSDimitry Andric     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6315349cc55cSDimitry Andric     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6316349cc55cSDimitry Andric 
6317349cc55cSDimitry Andric     InstructionCost ExtCost =
6318349cc55cSDimitry Andric         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6319349cc55cSDimitry Andric                              TTI::CastContextHint::None, CostKind, Op0);
6320349cc55cSDimitry Andric     InstructionCost MulCost =
6321349cc55cSDimitry Andric         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6322349cc55cSDimitry Andric     InstructionCost Ext2Cost =
6323349cc55cSDimitry Andric         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6324349cc55cSDimitry Andric                              TTI::CastContextHint::None, CostKind, RedOp);
6325349cc55cSDimitry Andric 
6326bdd1243dSDimitry Andric     InstructionCost RedCost = TTI.getMulAccReductionCost(
6327bdd1243dSDimitry Andric         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6328349cc55cSDimitry Andric 
6329349cc55cSDimitry Andric     if (RedCost.isValid() &&
6330349cc55cSDimitry Andric         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6331349cc55cSDimitry Andric       return I == RetI ? RedCost : 0;
6332349cc55cSDimitry Andric   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6333e8d8bef9SDimitry Andric              !TheLoop->isLoopInvariant(RedOp)) {
6334fe6060f1SDimitry Andric     // Matched reduce(ext(A))
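    // As an illustrative sketch for an add reduction (types assumed):
    //   %a.ext = zext <8 x i16> %a to <8 x i32>
    //   %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a.ext)
    // which some targets can lower as a single extending reduction.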
6335e8d8bef9SDimitry Andric     bool IsUnsigned = isa<ZExtInst>(RedOp);
6336e8d8bef9SDimitry Andric     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6337bdd1243dSDimitry Andric     InstructionCost RedCost = TTI.getExtendedReductionCost(
6338bdd1243dSDimitry Andric         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6339bdd1243dSDimitry Andric         RdxDesc.getFastMathFlags(), CostKind);
6340e8d8bef9SDimitry Andric 
6341fe6060f1SDimitry Andric     InstructionCost ExtCost =
6342e8d8bef9SDimitry Andric         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6343e8d8bef9SDimitry Andric                              TTI::CastContextHint::None, CostKind, RedOp);
6344e8d8bef9SDimitry Andric     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6345fe6060f1SDimitry Andric       return I == RetI ? RedCost : 0;
6346bdd1243dSDimitry Andric   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6347fe6060f1SDimitry Andric              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6348fe6060f1SDimitry Andric     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6349e8d8bef9SDimitry Andric         Op0->getOpcode() == Op1->getOpcode() &&
6350e8d8bef9SDimitry Andric         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6351e8d8bef9SDimitry Andric       bool IsUnsigned = isa<ZExtInst>(Op0);
63520eae32dcSDimitry Andric       Type *Op0Ty = Op0->getOperand(0)->getType();
63530eae32dcSDimitry Andric       Type *Op1Ty = Op1->getOperand(0)->getType();
63540eae32dcSDimitry Andric       Type *LargestOpTy =
63550eae32dcSDimitry Andric           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
63560eae32dcSDimitry Andric                                                                     : Op0Ty;
63570eae32dcSDimitry Andric       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
63580eae32dcSDimitry Andric 
6359bdd1243dSDimitry Andric       // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may have
63600eae32dcSDimitry Andric       // different sizes. We take the largest type as the ext to reduce and
63610eae32dcSDimitry Andric       // add the cost of the extra extend, as in reduce(mul(ext(ext(A)), ext(B))).
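      // For example, with illustrative types where A is i8 and B is i16, the
      // cost is modelled as a mul-acc reduction over i16 inputs plus one
      // extra i8 -> i16 extend for A.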
63620eae32dcSDimitry Andric       InstructionCost ExtCost0 = TTI.getCastInstrCost(
63630eae32dcSDimitry Andric           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6364e8d8bef9SDimitry Andric           TTI::CastContextHint::None, CostKind, Op0);
63650eae32dcSDimitry Andric       InstructionCost ExtCost1 = TTI.getCastInstrCost(
63660eae32dcSDimitry Andric           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
63670eae32dcSDimitry Andric           TTI::CastContextHint::None, CostKind, Op1);
6368fe6060f1SDimitry Andric       InstructionCost MulCost =
6369fe6060f1SDimitry Andric           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6370e8d8bef9SDimitry Andric 
6371bdd1243dSDimitry Andric       InstructionCost RedCost = TTI.getMulAccReductionCost(
6372bdd1243dSDimitry Andric           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
63730eae32dcSDimitry Andric       InstructionCost ExtraExtCost = 0;
63740eae32dcSDimitry Andric       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
63750eae32dcSDimitry Andric         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
63760eae32dcSDimitry Andric         ExtraExtCost = TTI.getCastInstrCost(
63770eae32dcSDimitry Andric             ExtraExtOp->getOpcode(), ExtType,
63780eae32dcSDimitry Andric             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
63790eae32dcSDimitry Andric             TTI::CastContextHint::None, CostKind, ExtraExtOp);
63800eae32dcSDimitry Andric       }
6381e8d8bef9SDimitry Andric 
63820eae32dcSDimitry Andric       if (RedCost.isValid() &&
63830eae32dcSDimitry Andric           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6384fe6060f1SDimitry Andric         return I == RetI ? RedCost : 0;
6385349cc55cSDimitry Andric     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6386bdd1243dSDimitry Andric       // Matched reduce.add(mul())
6387fe6060f1SDimitry Andric       InstructionCost MulCost =
6388fe6060f1SDimitry Andric           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6389e8d8bef9SDimitry Andric 
6390bdd1243dSDimitry Andric       InstructionCost RedCost = TTI.getMulAccReductionCost(
6391bdd1243dSDimitry Andric           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6392e8d8bef9SDimitry Andric 
6393e8d8bef9SDimitry Andric       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6394fe6060f1SDimitry Andric         return I == RetI ? RedCost : 0;
6395e8d8bef9SDimitry Andric     }
6396e8d8bef9SDimitry Andric   }
6397e8d8bef9SDimitry Andric 
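  // No reduction pattern was matched. Charge the plain reduction cost to the
  // final reduction instruction, and return std::nullopt for any other
  // instruction in the chain so that it is costed normally.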
6398bdd1243dSDimitry Andric   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6399e8d8bef9SDimitry Andric }
6400e8d8bef9SDimitry Andric 
6401e8d8bef9SDimitry Andric InstructionCost
6402e8d8bef9SDimitry Andric LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6403e8d8bef9SDimitry Andric                                                      ElementCount VF) {
64040b57cec5SDimitry Andric   // Calculate the scalar cost only. The vectorization cost should already be
64050b57cec5SDimitry Andric   // computed at this point.
6406e8d8bef9SDimitry Andric   if (VF.isScalar()) {
6407fe6060f1SDimitry Andric     Type *ValTy = getLoadStoreType(I);
64085ffd83dbSDimitry Andric     const Align Alignment = getLoadStoreAlignment(I);
64090b57cec5SDimitry Andric     unsigned AS = getLoadStoreAddressSpace(I);
64100b57cec5SDimitry Andric 
6411bdd1243dSDimitry Andric     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
64120b57cec5SDimitry Andric     return TTI.getAddressComputationCost(ValTy) +
64135ffd83dbSDimitry Andric            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6414bdd1243dSDimitry Andric                                TTI::TCK_RecipThroughput, OpInfo, I);
64150b57cec5SDimitry Andric   }
64160b57cec5SDimitry Andric   return getWideningCost(I, VF);
64170b57cec5SDimitry Andric }
64180b57cec5SDimitry Andric 
64190b57cec5SDimitry Andric LoopVectorizationCostModel::VectorizationCostTy
6420e8d8bef9SDimitry Andric LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6421e8d8bef9SDimitry Andric                                                ElementCount VF) {
64220b57cec5SDimitry Andric   // If we know that this instruction will remain uniform, check the cost of
64230b57cec5SDimitry Andric   // the scalar version.
64240b57cec5SDimitry Andric   if (isUniformAfterVectorization(I, VF))
6425e8d8bef9SDimitry Andric     VF = ElementCount::getFixed(1);
64260b57cec5SDimitry Andric 
6427e8d8bef9SDimitry Andric   if (VF.isVector() && isProfitableToScalarize(I, VF))
64280b57cec5SDimitry Andric     return VectorizationCostTy(InstsToScalarize[VF][I], false);
64290b57cec5SDimitry Andric 
64300b57cec5SDimitry Andric   // Forced scalars do not have any scalarization overhead.
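  // For example, at VF = 4 a forced-scalar instruction is costed below as
  // four copies of its scalar form, with no insert/extract overhead added.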
64310b57cec5SDimitry Andric   auto ForcedScalar = ForcedScalars.find(VF);
6432e8d8bef9SDimitry Andric   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
64330b57cec5SDimitry Andric     auto InstSet = ForcedScalar->second;
64345ffd83dbSDimitry Andric     if (InstSet.count(I))
6435e8d8bef9SDimitry Andric       return VectorizationCostTy(
6436e8d8bef9SDimitry Andric           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6437e8d8bef9SDimitry Andric            VF.getKnownMinValue()),
6438e8d8bef9SDimitry Andric           false);
64390b57cec5SDimitry Andric   }
64400b57cec5SDimitry Andric 
64410b57cec5SDimitry Andric   Type *VectorTy;
6442e8d8bef9SDimitry Andric   InstructionCost C = getInstructionCost(I, VF, VectorTy);
64430b57cec5SDimitry Andric 
6444349cc55cSDimitry Andric   bool TypeNotScalarized = false;
6445349cc55cSDimitry Andric   if (VF.isVector() && VectorTy->isVectorTy()) {
644681ad6265SDimitry Andric     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
644781ad6265SDimitry Andric       if (VF.isScalable())
644881ad6265SDimitry Andric         // <vscale x 1 x iN> is assumed to be profitable over iN because
644981ad6265SDimitry Andric         // scalable registers are a distinct register class from scalar ones.
645081ad6265SDimitry Andric         // If we ever find a target which wants to lower scalable vectors
645181ad6265SDimitry Andric         // back to scalars, we'll need to update this code to explicitly
645281ad6265SDimitry Andric         // ask TTI about the register class uses for each part.
645381ad6265SDimitry Andric         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6454349cc55cSDimitry Andric       else
645581ad6265SDimitry Andric         TypeNotScalarized = NumParts < VF.getKnownMinValue();
645681ad6265SDimitry Andric     } else
6457349cc55cSDimitry Andric       C = InstructionCost::getInvalid();
6458349cc55cSDimitry Andric   }
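  // For example, on a target with 128-bit vector registers, a <8 x i32>
  // value legalizes into NumParts = 2 < VF = 8 parts, so the type is not
  // scalarized. NumParts == 0 means the target cannot legalize the type at
  // all, in which case the cost above has been set to invalid.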
64590b57cec5SDimitry Andric   return VectorizationCostTy(C, TypeNotScalarized);
64600b57cec5SDimitry Andric }
64610b57cec5SDimitry Andric 
6462bdd1243dSDimitry Andric InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6463bdd1243dSDimitry Andric     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
64640b57cec5SDimitry Andric 
6465fe6060f1SDimitry Andric   // There is no mechanism yet to create a scalable scalarization loop,
6466fe6060f1SDimitry Andric   // so this is currently Invalid.
6467fe6060f1SDimitry Andric   if (VF.isScalable())
6468fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
6469fe6060f1SDimitry Andric 
6470e8d8bef9SDimitry Andric   if (VF.isScalar())
64710b57cec5SDimitry Andric     return 0;
64720b57cec5SDimitry Andric 
6473e8d8bef9SDimitry Andric   InstructionCost Cost = 0;
64740b57cec5SDimitry Andric   Type *RetTy = ToVectorTy(I->getType(), VF);
64750b57cec5SDimitry Andric   if (!RetTy->isVoidTy() &&
64760b57cec5SDimitry Andric       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
64775ffd83dbSDimitry Andric     Cost += TTI.getScalarizationOverhead(
6478bdd1243dSDimitry Andric         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6479bdd1243dSDimitry Andric         /*Insert*/ true,
6480bdd1243dSDimitry Andric         /*Extract*/ false, CostKind);
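  // For example, scalarizing an instruction producing i32 at VF = 4 charges
  // four insertelements to rebuild the <4 x i32> result vector.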
64810b57cec5SDimitry Andric 
64820b57cec5SDimitry Andric   // Some targets keep addresses scalar.
64830b57cec5SDimitry Andric   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
64840b57cec5SDimitry Andric     return Cost;
64850b57cec5SDimitry Andric 
64860b57cec5SDimitry Andric   // Some targets support efficient element stores.
64870b57cec5SDimitry Andric   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
64880b57cec5SDimitry Andric     return Cost;
64890b57cec5SDimitry Andric 
64900b57cec5SDimitry Andric   // Collect operands to consider.
64910b57cec5SDimitry Andric   CallInst *CI = dyn_cast<CallInst>(I);
6492349cc55cSDimitry Andric   Instruction::op_range Ops = CI ? CI->args() : I->operands();
64930b57cec5SDimitry Andric 
64940b57cec5SDimitry Andric   // Skip operands that do not require extraction/scalarization and do not incur
64950b57cec5SDimitry Andric   // any overhead.
6496fe6060f1SDimitry Andric   SmallVector<Type *> Tys;
6497fe6060f1SDimitry Andric   for (auto *V : filterExtractingOperands(Ops, VF))
6498fe6060f1SDimitry Andric     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
64990b57cec5SDimitry Andric   return Cost + TTI.getOperandsScalarizationOverhead(
6500bdd1243dSDimitry Andric                     filterExtractingOperands(Ops, VF), Tys, CostKind);
65010b57cec5SDimitry Andric }
65020b57cec5SDimitry Andric 
6503e8d8bef9SDimitry Andric void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6504e8d8bef9SDimitry Andric   if (VF.isScalar())
65050b57cec5SDimitry Andric     return;
65060b57cec5SDimitry Andric   NumPredStores = 0;
65070b57cec5SDimitry Andric   for (BasicBlock *BB : TheLoop->blocks()) {
65080b57cec5SDimitry Andric     // For each instruction in the old loop.
65090b57cec5SDimitry Andric     for (Instruction &I : *BB) {
65100b57cec5SDimitry Andric       Value *Ptr =  getLoadStorePointerOperand(&I);
65110b57cec5SDimitry Andric       if (!Ptr)
65120b57cec5SDimitry Andric         continue;
65130b57cec5SDimitry Andric 
65140b57cec5SDimitry Andric       // TODO: We should generate better code and update the cost model for
65150b57cec5SDimitry Andric       // predicated uniform stores. Today they are treated as any other
65160b57cec5SDimitry Andric       // predicated store (see added test cases in
65170b57cec5SDimitry Andric       // invariant-store-vectorization.ll).
651804eeddc0SDimitry Andric       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
65190b57cec5SDimitry Andric         NumPredStores++;
65200b57cec5SDimitry Andric 
6521fe013be4SDimitry Andric       if (Legal->isUniformMemOp(I, VF)) {
6522bdd1243dSDimitry Andric         auto isLegalToScalarize = [&]() {
6523bdd1243dSDimitry Andric           if (!VF.isScalable())
6524bdd1243dSDimitry Andric             // Scalarization of fixed length vectors "just works".
6525bdd1243dSDimitry Andric             return true;
6526bdd1243dSDimitry Andric 
6527bdd1243dSDimitry Andric           // We have dedicated lowering for unpredicated uniform loads and
6528bdd1243dSDimitry Andric           // stores.  Note that even with tail folding we know that at least
6529bdd1243dSDimitry Andric           // one lane is active (i.e. generalized predication is not possible
6530bdd1243dSDimitry Andric           // here), and the logic below depends on this fact.
6531bdd1243dSDimitry Andric           if (!foldTailByMasking())
6532bdd1243dSDimitry Andric             return true;
6533bdd1243dSDimitry Andric 
6534bdd1243dSDimitry Andric           // For scalable vectors, a uniform memop load is always
6535bdd1243dSDimitry Andric           // uniform-by-parts and we know how to scalarize that.
6536bdd1243dSDimitry Andric           if (isa<LoadInst>(I))
6537bdd1243dSDimitry Andric             return true;
6538bdd1243dSDimitry Andric 
6539bdd1243dSDimitry Andric           // A uniform store isn't necessarily uniform-by-parts
6540bdd1243dSDimitry Andric           // and we can't assume scalarization.
6541bdd1243dSDimitry Andric           auto &SI = cast<StoreInst>(I);
6542bdd1243dSDimitry Andric           return TheLoop->isLoopInvariant(SI.getValueOperand());
6543bdd1243dSDimitry Andric         };
6544bdd1243dSDimitry Andric 
6545bdd1243dSDimitry Andric         const InstructionCost GatherScatterCost =
6546bdd1243dSDimitry Andric           isLegalGatherOrScatter(&I, VF) ?
6547bdd1243dSDimitry Andric           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6548bdd1243dSDimitry Andric 
65490b57cec5SDimitry Andric         // Load: Scalar load + broadcast
65500b57cec5SDimitry Andric         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
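        // For example, a uniform load at VF = 4 is costed as one scalar load
        // plus one broadcast shuffle, rather than four separate loads.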
6551bdd1243dSDimitry Andric         // FIXME: This cost is a significant under-estimate for tail folded
6552bdd1243dSDimitry Andric         // memory ops.
6553bdd1243dSDimitry Andric         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6554bdd1243dSDimitry Andric           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6555bdd1243dSDimitry Andric 
6556bdd1243dSDimitry Andric         // Choose the better solution for the current VF. Note that Invalid
6557bdd1243dSDimitry Andric         // costs compare as maximally large. If both are invalid, the result
6558bdd1243dSDimitry Andric         // stays invalid, which signals a failure and a vectorization abort.
6559bdd1243dSDimitry Andric         if (GatherScatterCost < ScalarizationCost)
6560bdd1243dSDimitry Andric           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6561bdd1243dSDimitry Andric         else
6562bdd1243dSDimitry Andric           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
65630b57cec5SDimitry Andric         continue;
65640b57cec5SDimitry Andric       }
65650b57cec5SDimitry Andric 
65660b57cec5SDimitry Andric       // We assume that widening is the best solution when possible.
65670b57cec5SDimitry Andric       if (memoryInstructionCanBeWidened(&I, VF)) {
6568e8d8bef9SDimitry Andric         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6569349cc55cSDimitry Andric         int ConsecutiveStride = Legal->isConsecutivePtr(
6570349cc55cSDimitry Andric             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
65710b57cec5SDimitry Andric         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
65720b57cec5SDimitry Andric                "Expected consecutive stride.");
65730b57cec5SDimitry Andric         InstWidening Decision =
65740b57cec5SDimitry Andric             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
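        // For example, a load from A[N - i] has stride -1 and becomes
        // CM_Widen_Reverse: a consecutive wide load whose result is then
        // reversed in the vector.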
65750b57cec5SDimitry Andric         setWideningDecision(&I, VF, Decision, Cost);
65760b57cec5SDimitry Andric         continue;
65770b57cec5SDimitry Andric       }
65780b57cec5SDimitry Andric 
65790b57cec5SDimitry Andric       // Choose between Interleaving, Gather/Scatter or Scalarization.
6580fe6060f1SDimitry Andric       InstructionCost InterleaveCost = InstructionCost::getInvalid();
65810b57cec5SDimitry Andric       unsigned NumAccesses = 1;
65820b57cec5SDimitry Andric       if (isAccessInterleaved(&I)) {
65830b57cec5SDimitry Andric         auto Group = getInterleavedAccessGroup(&I);
65840b57cec5SDimitry Andric         assert(Group && "Fail to get an interleaved access group.");
65850b57cec5SDimitry Andric 
65860b57cec5SDimitry Andric         // Make one decision for the whole group.
65870b57cec5SDimitry Andric         if (getWideningDecision(&I, VF) != CM_Unknown)
65880b57cec5SDimitry Andric           continue;
65890b57cec5SDimitry Andric 
65900b57cec5SDimitry Andric         NumAccesses = Group->getNumMembers();
65910b57cec5SDimitry Andric         if (interleavedAccessCanBeWidened(&I, VF))
65920b57cec5SDimitry Andric           InterleaveCost = getInterleaveGroupCost(&I, VF);
65930b57cec5SDimitry Andric       }
65940b57cec5SDimitry Andric 
6595e8d8bef9SDimitry Andric       InstructionCost GatherScatterCost =
659604eeddc0SDimitry Andric           isLegalGatherOrScatter(&I, VF)
65970b57cec5SDimitry Andric               ? getGatherScatterCost(&I, VF) * NumAccesses
6598fe6060f1SDimitry Andric               : InstructionCost::getInvalid();
65990b57cec5SDimitry Andric 
6600e8d8bef9SDimitry Andric       InstructionCost ScalarizationCost =
66010b57cec5SDimitry Andric           getMemInstScalarizationCost(&I, VF) * NumAccesses;
66020b57cec5SDimitry Andric 
66030b57cec5SDimitry Andric       // Choose the better solution for the current VF, write down this
66040b57cec5SDimitry Andric       // decision, and use it during vectorization.
6605e8d8bef9SDimitry Andric       InstructionCost Cost;
66060b57cec5SDimitry Andric       InstWidening Decision;
66070b57cec5SDimitry Andric       if (InterleaveCost <= GatherScatterCost &&
66080b57cec5SDimitry Andric           InterleaveCost < ScalarizationCost) {
66090b57cec5SDimitry Andric         Decision = CM_Interleave;
66100b57cec5SDimitry Andric         Cost = InterleaveCost;
66110b57cec5SDimitry Andric       } else if (GatherScatterCost < ScalarizationCost) {
66120b57cec5SDimitry Andric         Decision = CM_GatherScatter;
66130b57cec5SDimitry Andric         Cost = GatherScatterCost;
66140b57cec5SDimitry Andric       } else {
66150b57cec5SDimitry Andric         Decision = CM_Scalarize;
66160b57cec5SDimitry Andric         Cost = ScalarizationCost;
66170b57cec5SDimitry Andric       }
66180b57cec5SDimitry Andric       // If the instruction belongs to an interleave group, the whole group
66190b57cec5SDimitry Andric       // receives the same decision and cost, though the cost will actually
66200b57cec5SDimitry Andric       // be assigned to just one instruction in the group.
66210b57cec5SDimitry Andric       if (auto Group = getInterleavedAccessGroup(&I))
66220b57cec5SDimitry Andric         setWideningDecision(Group, VF, Decision, Cost);
66230b57cec5SDimitry Andric       else
66240b57cec5SDimitry Andric         setWideningDecision(&I, VF, Decision, Cost);
66250b57cec5SDimitry Andric     }
66260b57cec5SDimitry Andric   }
66270b57cec5SDimitry Andric 
66280b57cec5SDimitry Andric   // Make sure that any load of an address and any other address computation
66290b57cec5SDimitry Andric   // remains scalar unless there is gather/scatter support. This avoids
66300b57cec5SDimitry Andric   // inevitable extracts into address registers, and also has the benefit of
66310b57cec5SDimitry Andric   // activating LSR more, since that pass can't optimize vectorized
66320b57cec5SDimitry Andric   // addresses.
66330b57cec5SDimitry Andric   if (TTI.prefersVectorizedAddressing())
66340b57cec5SDimitry Andric     return;
66350b57cec5SDimitry Andric 
66360b57cec5SDimitry Andric   // Start with all scalar pointer uses.
66370b57cec5SDimitry Andric   SmallPtrSet<Instruction *, 8> AddrDefs;
66380b57cec5SDimitry Andric   for (BasicBlock *BB : TheLoop->blocks())
66390b57cec5SDimitry Andric     for (Instruction &I : *BB) {
66400b57cec5SDimitry Andric       Instruction *PtrDef =
66410b57cec5SDimitry Andric         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
66420b57cec5SDimitry Andric       if (PtrDef && TheLoop->contains(PtrDef) &&
66430b57cec5SDimitry Andric           getWideningDecision(&I, VF) != CM_GatherScatter)
66440b57cec5SDimitry Andric         AddrDefs.insert(PtrDef);
66450b57cec5SDimitry Andric     }
66460b57cec5SDimitry Andric 
66470b57cec5SDimitry Andric   // Add all instructions used to generate the addresses.
66480b57cec5SDimitry Andric   SmallVector<Instruction *, 4> Worklist;
6649e8d8bef9SDimitry Andric   append_range(Worklist, AddrDefs);
66500b57cec5SDimitry Andric   while (!Worklist.empty()) {
66510b57cec5SDimitry Andric     Instruction *I = Worklist.pop_back_val();
66520b57cec5SDimitry Andric     for (auto &Op : I->operands())
66530b57cec5SDimitry Andric       if (auto *InstOp = dyn_cast<Instruction>(Op))
66540b57cec5SDimitry Andric         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
66550b57cec5SDimitry Andric             AddrDefs.insert(InstOp).second)
66560b57cec5SDimitry Andric           Worklist.push_back(InstOp);
66570b57cec5SDimitry Andric   }
66580b57cec5SDimitry Andric 
66590b57cec5SDimitry Andric   for (auto *I : AddrDefs) {
66600b57cec5SDimitry Andric     if (isa<LoadInst>(I)) {
66610b57cec5SDimitry Andric       // Setting the desired widening decision should ideally be handled by
66620b57cec5SDimitry Andric       // the cost functions, but since this involves finding out whether the
66630b57cec5SDimitry Andric       // loaded register is involved in an address computation, it is instead
66640b57cec5SDimitry Andric       // changed here when we know this is the case.
66650b57cec5SDimitry Andric       InstWidening Decision = getWideningDecision(I, VF);
66660b57cec5SDimitry Andric       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
66670b57cec5SDimitry Andric         // Scalarize a widened load of address.
6668e8d8bef9SDimitry Andric         setWideningDecision(
6669e8d8bef9SDimitry Andric             I, VF, CM_Scalarize,
6670e8d8bef9SDimitry Andric             (VF.getKnownMinValue() *
6671e8d8bef9SDimitry Andric              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
66720b57cec5SDimitry Andric       else if (auto Group = getInterleavedAccessGroup(I)) {
66730b57cec5SDimitry Andric         // Scalarize an interleave group of address loads.
66740b57cec5SDimitry Andric         for (unsigned I = 0; I < Group->getFactor(); ++I) {
66750b57cec5SDimitry Andric           if (Instruction *Member = Group->getMember(I))
6676e8d8bef9SDimitry Andric             setWideningDecision(
6677e8d8bef9SDimitry Andric                 Member, VF, CM_Scalarize,
6678e8d8bef9SDimitry Andric                 (VF.getKnownMinValue() *
6679e8d8bef9SDimitry Andric                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
66800b57cec5SDimitry Andric         }
66810b57cec5SDimitry Andric       }
66820b57cec5SDimitry Andric     } else
66830b57cec5SDimitry Andric       // Make sure I gets scalarized and a cost estimate without
66840b57cec5SDimitry Andric       // scalarization overhead.
66850b57cec5SDimitry Andric       ForcedScalars[VF].insert(I);
66860b57cec5SDimitry Andric   }
66870b57cec5SDimitry Andric }
66880b57cec5SDimitry Andric 
6689c9157d92SDimitry Andric void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6690c9157d92SDimitry Andric   assert(!VF.isScalar() &&
6691c9157d92SDimitry Andric          "Trying to set a vectorization decision for a scalar VF");
6692c9157d92SDimitry Andric 
6693c9157d92SDimitry Andric   for (BasicBlock *BB : TheLoop->blocks()) {
6694c9157d92SDimitry Andric     // For each instruction in the old loop.
6695c9157d92SDimitry Andric     for (Instruction &I : *BB) {
6696c9157d92SDimitry Andric       CallInst *CI = dyn_cast<CallInst>(&I);
6697c9157d92SDimitry Andric 
6698c9157d92SDimitry Andric       if (!CI)
6699c9157d92SDimitry Andric         continue;
6700c9157d92SDimitry Andric 
6701c9157d92SDimitry Andric       InstructionCost ScalarCost = InstructionCost::getInvalid();
6702c9157d92SDimitry Andric       InstructionCost VectorCost = InstructionCost::getInvalid();
6703c9157d92SDimitry Andric       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6704c9157d92SDimitry Andric       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6705c9157d92SDimitry Andric 
6706c9157d92SDimitry Andric       Function *ScalarFunc = CI->getCalledFunction();
6707c9157d92SDimitry Andric       Type *ScalarRetTy = CI->getType();
6708c9157d92SDimitry Andric       SmallVector<Type *, 4> Tys, ScalarTys;
6709c9157d92SDimitry Andric       bool MaskRequired = Legal->isMaskRequired(CI);
6710c9157d92SDimitry Andric       for (auto &ArgOp : CI->args())
6711c9157d92SDimitry Andric         ScalarTys.push_back(ArgOp->getType());
6712c9157d92SDimitry Andric 
6713c9157d92SDimitry Andric       // Compute corresponding vector type for return value and arguments.
6714c9157d92SDimitry Andric       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6715c9157d92SDimitry Andric       for (Type *ScalarTy : ScalarTys)
6716c9157d92SDimitry Andric         Tys.push_back(ToVectorTy(ScalarTy, VF));
6717c9157d92SDimitry Andric 
6718c9157d92SDimitry Andric       // An in-loop reduction using an fmuladd intrinsic is a special case;
6719c9157d92SDimitry Andric       // we don't want the normal cost for that intrinsic.
6720c9157d92SDimitry Andric       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6721c9157d92SDimitry Andric         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6722c9157d92SDimitry Andric           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6723c9157d92SDimitry Andric                                   getVectorIntrinsicIDForCall(CI, TLI),
6724c9157d92SDimitry Andric                                   std::nullopt, *RedCost);
6725c9157d92SDimitry Andric           continue;
6726c9157d92SDimitry Andric         }
6727c9157d92SDimitry Andric 
6728c9157d92SDimitry Andric       // Estimate cost of scalarized vector call. The source operands are
6729c9157d92SDimitry Andric       // assumed to be vectors, so we need to extract individual elements from
6730c9157d92SDimitry Andric       // there, execute VF scalar calls, and then gather the result into the
6731c9157d92SDimitry Andric       // vector return value.
6732c9157d92SDimitry Andric       InstructionCost ScalarCallCost =
6733c9157d92SDimitry Andric           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6734c9157d92SDimitry Andric 
6735c9157d92SDimitry Andric       // Compute costs of unpacking argument values for the scalar calls and
6736c9157d92SDimitry Andric       // packing the return values to a vector.
6737c9157d92SDimitry Andric       InstructionCost ScalarizationCost =
6738c9157d92SDimitry Andric           getScalarizationOverhead(CI, VF, CostKind);
6739c9157d92SDimitry Andric 
6740c9157d92SDimitry Andric       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
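      // For example, at VF = 4 this is the cost of four scalar calls plus
      // the extracts that feed their arguments and the inserts that repack
      // the four results into the vector return value.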
6741c9157d92SDimitry Andric 
6742c9157d92SDimitry Andric       // Find the cost of vectorizing the call, if we can find a suitable
6743c9157d92SDimitry Andric       // vector variant of the function.
6744c9157d92SDimitry Andric       bool UsesMask = false;
6745c9157d92SDimitry Andric       VFInfo FuncInfo;
6746c9157d92SDimitry Andric       Function *VecFunc = nullptr;
6747c9157d92SDimitry Andric       // Search through any available variants for one we can use at this VF.
6748c9157d92SDimitry Andric       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6749c9157d92SDimitry Andric         // Must match requested VF.
6750c9157d92SDimitry Andric         if (Info.Shape.VF != VF)
6751c9157d92SDimitry Andric           continue;
6752c9157d92SDimitry Andric 
6753c9157d92SDimitry Andric         // Must take a mask argument if one is required
6754c9157d92SDimitry Andric         if (MaskRequired && !Info.isMasked())
6755c9157d92SDimitry Andric           continue;
6756c9157d92SDimitry Andric 
6757c9157d92SDimitry Andric         // Check that all parameter kinds are supported
6758c9157d92SDimitry Andric         bool ParamsOk = true;
6759c9157d92SDimitry Andric         for (VFParameter Param : Info.Shape.Parameters) {
6760c9157d92SDimitry Andric           switch (Param.ParamKind) {
6761c9157d92SDimitry Andric           case VFParamKind::Vector:
6762c9157d92SDimitry Andric             break;
6763c9157d92SDimitry Andric           case VFParamKind::OMP_Uniform: {
6764c9157d92SDimitry Andric             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6765c9157d92SDimitry Andric             // Make sure the scalar parameter is invariant in the loop.
6766c9157d92SDimitry Andric             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6767c9157d92SDimitry Andric                                               TheLoop))
6768c9157d92SDimitry Andric               ParamsOk = false;
6769c9157d92SDimitry Andric             break;
6770c9157d92SDimitry Andric           }
6771c9157d92SDimitry Andric           case VFParamKind::OMP_Linear: {
6772c9157d92SDimitry Andric             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6773c9157d92SDimitry Andric             // Find the stride for the scalar parameter in this loop and see if
6774c9157d92SDimitry Andric             // it matches the stride for the variant.
6775c9157d92SDimitry Andric             // TODO: do we need to figure out the cost of an extract to get the
6776c9157d92SDimitry Andric             // first lane? Or do we hope that it will be folded away?
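            // For example, a variant declared with an OpenMP-style
            // 'linear(p:2)' clause only matches when this argument's SCEV is
            // an add recurrence in this loop with a constant step of 2.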
6777c9157d92SDimitry Andric             ScalarEvolution *SE = PSE.getSE();
6778c9157d92SDimitry Andric             const auto *SAR =
6779c9157d92SDimitry Andric                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6780c9157d92SDimitry Andric 
6781c9157d92SDimitry Andric             if (!SAR || SAR->getLoop() != TheLoop) {
6782c9157d92SDimitry Andric               ParamsOk = false;
6783c9157d92SDimitry Andric               break;
6784c9157d92SDimitry Andric             }
6785c9157d92SDimitry Andric 
6786c9157d92SDimitry Andric             const SCEVConstant *Step =
6787c9157d92SDimitry Andric                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6788c9157d92SDimitry Andric 
6789c9157d92SDimitry Andric             if (!Step ||
6790c9157d92SDimitry Andric                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6791c9157d92SDimitry Andric               ParamsOk = false;
6792c9157d92SDimitry Andric 
6793c9157d92SDimitry Andric             break;
6794c9157d92SDimitry Andric           }
6795c9157d92SDimitry Andric           case VFParamKind::GlobalPredicate:
6796c9157d92SDimitry Andric             UsesMask = true;
6797c9157d92SDimitry Andric             break;
6798c9157d92SDimitry Andric           default:
6799c9157d92SDimitry Andric             ParamsOk = false;
6800c9157d92SDimitry Andric             break;
6801c9157d92SDimitry Andric           }
6802c9157d92SDimitry Andric         }
6803c9157d92SDimitry Andric 
6804c9157d92SDimitry Andric         if (!ParamsOk)
6805c9157d92SDimitry Andric           continue;
6806c9157d92SDimitry Andric 
6807c9157d92SDimitry Andric         // Found a suitable candidate, stop here.
6808c9157d92SDimitry Andric         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6809c9157d92SDimitry Andric         FuncInfo = Info;
6810c9157d92SDimitry Andric         break;
6811c9157d92SDimitry Andric       }
6812c9157d92SDimitry Andric 
6813c9157d92SDimitry Andric       // Add in the cost of synthesizing a mask if one wasn't required.
6814c9157d92SDimitry Andric       InstructionCost MaskCost = 0;
6815c9157d92SDimitry Andric       if (VecFunc && UsesMask && !MaskRequired)
6816c9157d92SDimitry Andric         MaskCost = TTI.getShuffleCost(
6817c9157d92SDimitry Andric             TargetTransformInfo::SK_Broadcast,
6818c9157d92SDimitry Andric             VectorType::get(IntegerType::getInt1Ty(
6819c9157d92SDimitry Andric                                 VecFunc->getFunctionType()->getContext()),
6820c9157d92SDimitry Andric                             VF));
6821c9157d92SDimitry Andric 
6822c9157d92SDimitry Andric       if (TLI && VecFunc && !CI->isNoBuiltin())
6823c9157d92SDimitry Andric         VectorCost =
6824c9157d92SDimitry Andric             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6825c9157d92SDimitry Andric 
6826c9157d92SDimitry Andric       // Find the cost of an intrinsic; some targets may have instructions that
6827c9157d92SDimitry Andric       // perform the operation without needing an actual call.
6828c9157d92SDimitry Andric       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6829c9157d92SDimitry Andric       if (IID != Intrinsic::not_intrinsic)
6830c9157d92SDimitry Andric         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6831c9157d92SDimitry Andric 
6832c9157d92SDimitry Andric       InstructionCost Cost = ScalarCost;
6833c9157d92SDimitry Andric       InstWidening Decision = CM_Scalarize;
6834c9157d92SDimitry Andric 
6835c9157d92SDimitry Andric       if (VectorCost <= Cost) {
6836c9157d92SDimitry Andric         Cost = VectorCost;
6837c9157d92SDimitry Andric         Decision = CM_VectorCall;
6838c9157d92SDimitry Andric       }
6839c9157d92SDimitry Andric 
6840c9157d92SDimitry Andric       if (IntrinsicCost <= Cost) {
6841c9157d92SDimitry Andric         Cost = IntrinsicCost;
6842c9157d92SDimitry Andric         Decision = CM_IntrinsicCall;
6843c9157d92SDimitry Andric       }
6844c9157d92SDimitry Andric 
6845c9157d92SDimitry Andric       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6846c9157d92SDimitry Andric                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6847c9157d92SDimitry Andric     }
6848c9157d92SDimitry Andric   }
6849c9157d92SDimitry Andric }
6850c9157d92SDimitry Andric 
6851e8d8bef9SDimitry Andric InstructionCost
6852e8d8bef9SDimitry Andric LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
68530b57cec5SDimitry Andric                                                Type *&VectorTy) {
68540b57cec5SDimitry Andric   Type *RetTy = I->getType();
68550b57cec5SDimitry Andric   if (canTruncateToMinimalBitwidth(I, VF))
68560b57cec5SDimitry Andric     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
68570b57cec5SDimitry Andric   auto SE = PSE.getSE();
68585ffd83dbSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
68590b57cec5SDimitry Andric 
6860fe6060f1SDimitry Andric   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6861fe6060f1SDimitry Andric                                                 ElementCount VF) -> bool {
6862fe6060f1SDimitry Andric     if (VF.isScalar())
6863fe6060f1SDimitry Andric       return true;
6864fe6060f1SDimitry Andric 
6865fe6060f1SDimitry Andric     auto Scalarized = InstsToScalarize.find(VF);
6866fe6060f1SDimitry Andric     assert(Scalarized != InstsToScalarize.end() &&
6867fe6060f1SDimitry Andric            "VF not yet analyzed for scalarization profitability");
6868fe6060f1SDimitry Andric     return !Scalarized->second.count(I) &&
6869fe6060f1SDimitry Andric            llvm::all_of(I->users(), [&](User *U) {
6870fe6060f1SDimitry Andric              auto *UI = cast<Instruction>(U);
6871fe6060f1SDimitry Andric              return !Scalarized->second.count(UI);
6872fe6060f1SDimitry Andric            });
6873fe6060f1SDimitry Andric   };
6874fe6060f1SDimitry Andric   (void) hasSingleCopyAfterVectorization;
6875fe6060f1SDimitry Andric 
6876fe6060f1SDimitry Andric   if (isScalarAfterVectorization(I, VF)) {
6877fe6060f1SDimitry Andric     // With the exception of GEPs and PHIs, after scalarization there should
6878fe6060f1SDimitry Andric     // only be one copy of the instruction generated in the loop. This is
6879fe6060f1SDimitry Andric     // because the VF is either 1, or any instructions that need scalarizing
6880c9157d92SDimitry Andric     // have already been dealt with by the time we get here. As a result,
6881fe6060f1SDimitry Andric     // we don't have to multiply the instruction cost by VF.
6882fe6060f1SDimitry Andric     assert(I->getOpcode() == Instruction::GetElementPtr ||
6883fe6060f1SDimitry Andric            I->getOpcode() == Instruction::PHI ||
6884fe6060f1SDimitry Andric            (I->getOpcode() == Instruction::BitCast &&
6885fe6060f1SDimitry Andric             I->getType()->isPointerTy()) ||
6886fe6060f1SDimitry Andric            hasSingleCopyAfterVectorization(I, VF));
6887fe6060f1SDimitry Andric     VectorTy = RetTy;
6888fe6060f1SDimitry Andric   } else
6889fe6060f1SDimitry Andric     VectorTy = ToVectorTy(RetTy, VF);
6890fe6060f1SDimitry Andric 
68910b57cec5SDimitry Andric   // TODO: We need to estimate the cost of intrinsic calls.
68920b57cec5SDimitry Andric   switch (I->getOpcode()) {
68930b57cec5SDimitry Andric   case Instruction::GetElementPtr:
68940b57cec5SDimitry Andric     // We mark this instruction as zero-cost because the cost of GEPs in
68950b57cec5SDimitry Andric     // vectorized code depends on whether the corresponding memory instruction
68960b57cec5SDimitry Andric     // is scalarized or not. Therefore, we handle GEPs with the memory
68970b57cec5SDimitry Andric     // instruction cost.
68980b57cec5SDimitry Andric     return 0;
68990b57cec5SDimitry Andric   case Instruction::Br: {
69000b57cec5SDimitry Andric     // In cases of scalarized and predicated instructions, there will be VF
69010b57cec5SDimitry Andric     // predicated blocks in the vectorized loop. Each branch around these
69020b57cec5SDimitry Andric     // blocks also requires an extract of its vector compare i1 element.
69030b57cec5SDimitry Andric     bool ScalarPredicatedBB = false;
69040b57cec5SDimitry Andric     BranchInst *BI = cast<BranchInst>(I);
6905e8d8bef9SDimitry Andric     if (VF.isVector() && BI->isConditional() &&
6906753f127fSDimitry Andric         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6907753f127fSDimitry Andric          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
69080b57cec5SDimitry Andric       ScalarPredicatedBB = true;
69090b57cec5SDimitry Andric 
69100b57cec5SDimitry Andric     if (ScalarPredicatedBB) {
6911fe6060f1SDimitry Andric       // Not possible to scalarize scalable vectors with predicated instructions.
6912fe6060f1SDimitry Andric       if (VF.isScalable())
6913fe6060f1SDimitry Andric         return InstructionCost::getInvalid();
69140b57cec5SDimitry Andric       // Return cost for branches around scalarized and predicated blocks.
69155ffd83dbSDimitry Andric       auto *Vec_i1Ty =
6916e8d8bef9SDimitry Andric           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6917fe6060f1SDimitry Andric       return (
6918fe6060f1SDimitry Andric           TTI.getScalarizationOverhead(
6919bdd1243dSDimitry Andric               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6920bdd1243dSDimitry Andric               /*Insert*/ false, /*Extract*/ true, CostKind) +
6921fe6060f1SDimitry Andric           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6922e8d8bef9SDimitry Andric     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
69230b57cec5SDimitry Andric       // The back-edge branch will remain, as will all scalar branches.
69245ffd83dbSDimitry Andric       return TTI.getCFInstrCost(Instruction::Br, CostKind);
69250b57cec5SDimitry Andric     else
69260b57cec5SDimitry Andric       // This branch will be eliminated by if-conversion.
69270b57cec5SDimitry Andric       return 0;
69280b57cec5SDimitry Andric     // Note: We currently assume zero cost for an unconditional branch inside
69290b57cec5SDimitry Andric     // a predicated block since it will become a fall-through, although we
69300b57cec5SDimitry Andric     // may decide in the future to call TTI for all branches.
69310b57cec5SDimitry Andric   }
69320b57cec5SDimitry Andric   case Instruction::PHI: {
69330b57cec5SDimitry Andric     auto *Phi = cast<PHINode>(I);
69340b57cec5SDimitry Andric 
69350b57cec5SDimitry Andric     // First-order recurrences are replaced by vector shuffles inside the loop.
6936bdd1243dSDimitry Andric     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6937bdd1243dSDimitry Andric       SmallVector<int> Mask(VF.getKnownMinValue());
6938bdd1243dSDimitry Andric       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
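      // For example, at VF = 4 the mask is <3, 4, 5, 6>: the splice takes
      // the last lane of the previous vector followed by the first three
      // lanes of the current one.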
6939bdd1243dSDimitry Andric       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6940bdd1243dSDimitry Andric                                 cast<VectorType>(VectorTy), Mask, CostKind,
6941bdd1243dSDimitry Andric                                 VF.getKnownMinValue() - 1);
6942bdd1243dSDimitry Andric     }
69430b57cec5SDimitry Andric 
69440b57cec5SDimitry Andric     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
69450b57cec5SDimitry Andric     // converted into select instructions. We require N - 1 selects per phi
69460b57cec5SDimitry Andric     // node, where N is the number of incoming values.
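    // For example, a phi merging three incoming values needs two selects
    // (with m1 and m2 standing for the incoming block masks):
    //   phi [a, bb0], [b, bb1], [c, bb2] --> select(m2, c, select(m1, b, a))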
6947e8d8bef9SDimitry Andric     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
69480b57cec5SDimitry Andric       return (Phi->getNumIncomingValues() - 1) *
69490b57cec5SDimitry Andric              TTI.getCmpSelInstrCost(
69500b57cec5SDimitry Andric                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
69515ffd83dbSDimitry Andric                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6952e8d8bef9SDimitry Andric                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
69530b57cec5SDimitry Andric 
69545ffd83dbSDimitry Andric     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
69550b57cec5SDimitry Andric   }
69560b57cec5SDimitry Andric   case Instruction::UDiv:
69570b57cec5SDimitry Andric   case Instruction::SDiv:
69580b57cec5SDimitry Andric   case Instruction::URem:
69590b57cec5SDimitry Andric   case Instruction::SRem:
6960bdd1243dSDimitry Andric     if (VF.isVector() && isPredicatedInst(I)) {
6961bdd1243dSDimitry Andric       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6962bdd1243dSDimitry Andric       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6963bdd1243dSDimitry Andric         ScalarCost : SafeDivisorCost;
69640b57cec5SDimitry Andric     }
6965bdd1243dSDimitry Andric     // We've proven all lanes safe to speculate, fall through.
6966bdd1243dSDimitry Andric     [[fallthrough]];
69670b57cec5SDimitry Andric   case Instruction::Add:
69680b57cec5SDimitry Andric   case Instruction::FAdd:
69690b57cec5SDimitry Andric   case Instruction::Sub:
69700b57cec5SDimitry Andric   case Instruction::FSub:
69710b57cec5SDimitry Andric   case Instruction::Mul:
69720b57cec5SDimitry Andric   case Instruction::FMul:
69730b57cec5SDimitry Andric   case Instruction::FDiv:
69740b57cec5SDimitry Andric   case Instruction::FRem:
69750b57cec5SDimitry Andric   case Instruction::Shl:
69760b57cec5SDimitry Andric   case Instruction::LShr:
69770b57cec5SDimitry Andric   case Instruction::AShr:
69780b57cec5SDimitry Andric   case Instruction::And:
69790b57cec5SDimitry Andric   case Instruction::Or:
69800b57cec5SDimitry Andric   case Instruction::Xor: {
6981fe013be4SDimitry Andric     // If we're speculating on the stride being 1, the multiplication may
6982fe013be4SDimitry Andric     // fold away.  We can generalize this for all operations using the notion
6983fe013be4SDimitry Andric     // of neutral elements.  (TODO)
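    // For example, an offset computed as 'mul %i, %stride' is free once we
    // have speculated (via the SCEV predicates in PSE) that %stride is 1,
    // since the multiply folds away to %i.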
6984fe013be4SDimitry Andric     if (I->getOpcode() == Instruction::Mul &&
6985fe013be4SDimitry Andric         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6986fe013be4SDimitry Andric          PSE.getSCEV(I->getOperand(1))->isOne()))
69870b57cec5SDimitry Andric       return 0;
6988e8d8bef9SDimitry Andric 
6989e8d8bef9SDimitry Andric     // Detect reduction patterns
6990fe6060f1SDimitry Andric     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6991fe6060f1SDimitry Andric       return *RedCost;
6992e8d8bef9SDimitry Andric 
69930b57cec5SDimitry Andric     // Certain instructions can be cheaper to vectorize if they have a constant
69940b57cec5SDimitry Andric     // second vector operand. One example of this is shifts on x86.
69950b57cec5SDimitry Andric     Value *Op2 = I->getOperand(1);
6996bdd1243dSDimitry Andric     auto Op2Info = TTI.getOperandInfo(Op2);
6997fe013be4SDimitry Andric     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6998fe013be4SDimitry Andric         Legal->isInvariant(Op2))
6999bdd1243dSDimitry Andric       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
70000b57cec5SDimitry Andric 
70010b57cec5SDimitry Andric     SmallVector<const Value *, 4> Operands(I->operand_values());
7002a58f00eaSDimitry Andric     auto InstrCost = TTI.getArithmeticInstrCost(
7003bdd1243dSDimitry Andric         I->getOpcode(), VectorTy, CostKind,
7004bdd1243dSDimitry Andric         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7005bdd1243dSDimitry Andric         Op2Info, Operands, I);
7006a58f00eaSDimitry Andric 
7007a58f00eaSDimitry Andric     // Some targets can replace frem with vector library calls.
7008a58f00eaSDimitry Andric     InstructionCost VecCallCost = InstructionCost::getInvalid();
7009a58f00eaSDimitry Andric     if (I->getOpcode() == Instruction::FRem) {
7010a58f00eaSDimitry Andric       LibFunc Func;
7011a58f00eaSDimitry Andric       if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
7012a58f00eaSDimitry Andric           TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
7013a58f00eaSDimitry Andric         SmallVector<Type *, 4> OpTypes;
7014a58f00eaSDimitry Andric         for (auto &Op : I->operands())
7015a58f00eaSDimitry Andric           OpTypes.push_back(Op->getType());
7016a58f00eaSDimitry Andric         VecCallCost =
7017a58f00eaSDimitry Andric             TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
7018a58f00eaSDimitry Andric       }
7019a58f00eaSDimitry Andric     }
7020a58f00eaSDimitry Andric     return std::min(InstrCost, VecCallCost);
70210b57cec5SDimitry Andric   }
70220b57cec5SDimitry Andric   case Instruction::FNeg: {
7023fe6060f1SDimitry Andric     return TTI.getArithmeticInstrCost(
7024bdd1243dSDimitry Andric         I->getOpcode(), VectorTy, CostKind,
7025bdd1243dSDimitry Andric         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7026bdd1243dSDimitry Andric         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7027bdd1243dSDimitry Andric         I->getOperand(0), I);
70280b57cec5SDimitry Andric   }
70290b57cec5SDimitry Andric   case Instruction::Select: {
70300b57cec5SDimitry Andric     SelectInst *SI = cast<SelectInst>(I);
70310b57cec5SDimitry Andric     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
70320b57cec5SDimitry Andric     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7033fe6060f1SDimitry Andric 
7034fe6060f1SDimitry Andric     const Value *Op0, *Op1;
7035fe6060f1SDimitry Andric     using namespace llvm::PatternMatch;
7036fe6060f1SDimitry Andric     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7037fe6060f1SDimitry Andric                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7038fe6060f1SDimitry Andric       // select x, y, false --> x & y
7039fe6060f1SDimitry Andric       // select x, true, y --> x | y
7040bdd1243dSDimitry Andric       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7041bdd1243dSDimitry Andric       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7042fe6060f1SDimitry Andric       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7043fe6060f1SDimitry Andric               Op1->getType()->getScalarSizeInBits() == 1);
7044fe6060f1SDimitry Andric 
7045fe6060f1SDimitry Andric       SmallVector<const Value *, 2> Operands{Op0, Op1};
7046fe6060f1SDimitry Andric       return TTI.getArithmeticInstrCost(
7047fe6060f1SDimitry Andric           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7048bdd1243dSDimitry Andric           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7049fe6060f1SDimitry Andric     }
7050fe6060f1SDimitry Andric 
70510b57cec5SDimitry Andric     Type *CondTy = SI->getCondition()->getType();
70520b57cec5SDimitry Andric     if (!ScalarCond)
7053e8d8bef9SDimitry Andric       CondTy = VectorType::get(CondTy, VF);
70540eae32dcSDimitry Andric 
70550eae32dcSDimitry Andric     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
70560eae32dcSDimitry Andric     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
70570eae32dcSDimitry Andric       Pred = Cmp->getPredicate();
70580eae32dcSDimitry Andric     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
70590eae32dcSDimitry Andric                                   CostKind, I);
70600b57cec5SDimitry Andric   }
70610b57cec5SDimitry Andric   case Instruction::ICmp:
70620b57cec5SDimitry Andric   case Instruction::FCmp: {
70630b57cec5SDimitry Andric     Type *ValTy = I->getOperand(0)->getType();
70640b57cec5SDimitry Andric     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
70650b57cec5SDimitry Andric     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
70660b57cec5SDimitry Andric       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
70670b57cec5SDimitry Andric     VectorTy = ToVectorTy(ValTy, VF);
7068e8d8bef9SDimitry Andric     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
70690eae32dcSDimitry Andric                                   cast<CmpInst>(I)->getPredicate(), CostKind,
70700eae32dcSDimitry Andric                                   I);
70710b57cec5SDimitry Andric   }
70720b57cec5SDimitry Andric   case Instruction::Store:
70730b57cec5SDimitry Andric   case Instruction::Load: {
7074e8d8bef9SDimitry Andric     ElementCount Width = VF;
7075e8d8bef9SDimitry Andric     if (Width.isVector()) {
70760b57cec5SDimitry Andric       InstWidening Decision = getWideningDecision(I, Width);
70770b57cec5SDimitry Andric       assert(Decision != CM_Unknown &&
70780b57cec5SDimitry Andric              "CM decision should be taken at this point");
7079fcaf7f86SDimitry Andric       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
708081ad6265SDimitry Andric         return InstructionCost::getInvalid();
7081fcaf7f86SDimitry Andric       if (Decision == CM_Scalarize)
7082e8d8bef9SDimitry Andric         Width = ElementCount::getFixed(1);
70830b57cec5SDimitry Andric     }
7084fe6060f1SDimitry Andric     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
70850b57cec5SDimitry Andric     return getMemoryInstructionCost(I, VF);
70860b57cec5SDimitry Andric   }
7087fe6060f1SDimitry Andric   case Instruction::BitCast:
7088fe6060f1SDimitry Andric     if (I->getType()->isPointerTy())
7089fe6060f1SDimitry Andric       return 0;
7090bdd1243dSDimitry Andric     [[fallthrough]];
70910b57cec5SDimitry Andric   case Instruction::ZExt:
70920b57cec5SDimitry Andric   case Instruction::SExt:
70930b57cec5SDimitry Andric   case Instruction::FPToUI:
70940b57cec5SDimitry Andric   case Instruction::FPToSI:
70950b57cec5SDimitry Andric   case Instruction::FPExt:
70960b57cec5SDimitry Andric   case Instruction::PtrToInt:
70970b57cec5SDimitry Andric   case Instruction::IntToPtr:
70980b57cec5SDimitry Andric   case Instruction::SIToFP:
70990b57cec5SDimitry Andric   case Instruction::UIToFP:
71000b57cec5SDimitry Andric   case Instruction::Trunc:
7101fe6060f1SDimitry Andric   case Instruction::FPTrunc: {
7102e8d8bef9SDimitry Andric     // Computes the CastContextHint from a Load/Store instruction.
7103e8d8bef9SDimitry Andric     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7104e8d8bef9SDimitry Andric       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7105e8d8bef9SDimitry Andric              "Expected a load or a store!");
7106e8d8bef9SDimitry Andric 
7107e8d8bef9SDimitry Andric       if (VF.isScalar() || !TheLoop->contains(I))
7108e8d8bef9SDimitry Andric         return TTI::CastContextHint::Normal;
7109e8d8bef9SDimitry Andric 
7110e8d8bef9SDimitry Andric       switch (getWideningDecision(I, VF)) {
7111e8d8bef9SDimitry Andric       case LoopVectorizationCostModel::CM_GatherScatter:
7112e8d8bef9SDimitry Andric         return TTI::CastContextHint::GatherScatter;
7113e8d8bef9SDimitry Andric       case LoopVectorizationCostModel::CM_Interleave:
7114e8d8bef9SDimitry Andric         return TTI::CastContextHint::Interleave;
7115e8d8bef9SDimitry Andric       case LoopVectorizationCostModel::CM_Scalarize:
7116e8d8bef9SDimitry Andric       case LoopVectorizationCostModel::CM_Widen:
7117e8d8bef9SDimitry Andric         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7118e8d8bef9SDimitry Andric                                         : TTI::CastContextHint::Normal;
7119e8d8bef9SDimitry Andric       case LoopVectorizationCostModel::CM_Widen_Reverse:
7120e8d8bef9SDimitry Andric         return TTI::CastContextHint::Reversed;
7121e8d8bef9SDimitry Andric       case LoopVectorizationCostModel::CM_Unknown:
7122e8d8bef9SDimitry Andric         llvm_unreachable("Instr did not go through cost modelling?");
7123c9157d92SDimitry Andric       case LoopVectorizationCostModel::CM_VectorCall:
7124c9157d92SDimitry Andric       case LoopVectorizationCostModel::CM_IntrinsicCall:
7125c9157d92SDimitry Andric         llvm_unreachable_internal("Instr has invalid widening decision");
7126e8d8bef9SDimitry Andric       }
7127e8d8bef9SDimitry Andric 
7128e8d8bef9SDimitry Andric       llvm_unreachable("Unhandled case!");
7129e8d8bef9SDimitry Andric     };
7130e8d8bef9SDimitry Andric 
7131e8d8bef9SDimitry Andric     unsigned Opcode = I->getOpcode();
7132e8d8bef9SDimitry Andric     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7133e8d8bef9SDimitry Andric     // For Trunc, the context is the only user, which must be a StoreInst.
7134e8d8bef9SDimitry Andric     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7135e8d8bef9SDimitry Andric       if (I->hasOneUse())
7136e8d8bef9SDimitry Andric         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7137e8d8bef9SDimitry Andric           CCH = ComputeCCH(Store);
7138e8d8bef9SDimitry Andric     }
7139e8d8bef9SDimitry Andric     // For Z/Sext, the context is the operand, which must be a LoadInst.
7140e8d8bef9SDimitry Andric     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7141e8d8bef9SDimitry Andric              Opcode == Instruction::FPExt) {
7142e8d8bef9SDimitry Andric       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7143e8d8bef9SDimitry Andric         CCH = ComputeCCH(Load);
7144e8d8bef9SDimitry Andric     }
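7144e8d8bef9SDimitry Andric     // For illustration, the two patterns handled above look roughly like
7144e8d8bef9SDimitry Andric     // this in IR (hypothetical example, not from a specific test):
7144e8d8bef9SDimitry Andric     //   %t = trunc i32 %x to i8    ; context is the store below
7144e8d8bef9SDimitry Andric     //   store i8 %t, ptr %p
7144e8d8bef9SDimitry Andric     // and
7144e8d8bef9SDimitry Andric     //   %l = load i16, ptr %q
7144e8d8bef9SDimitry Andric     //   %e = zext i16 %l to i32    ; context is the load above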
7145e8d8bef9SDimitry Andric 
71460b57cec5SDimitry Andric     // We optimize the truncation of induction variables having constant
71470b57cec5SDimitry Andric     // integer steps. The cost of these truncations is the same as the scalar
71480b57cec5SDimitry Andric     // operation.
71490b57cec5SDimitry Andric     if (isOptimizableIVTruncate(I, VF)) {
71500b57cec5SDimitry Andric       auto *Trunc = cast<TruncInst>(I);
71510b57cec5SDimitry Andric       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7152e8d8bef9SDimitry Andric                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
71530b57cec5SDimitry Andric     }
71540b57cec5SDimitry Andric 
7155e8d8bef9SDimitry Andric     // Detect reduction patterns
7156fe6060f1SDimitry Andric     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7157fe6060f1SDimitry Andric       return *RedCost;
7158e8d8bef9SDimitry Andric 
71590b57cec5SDimitry Andric     Type *SrcScalarTy = I->getOperand(0)->getType();
71600b57cec5SDimitry Andric     Type *SrcVecTy =
71610b57cec5SDimitry Andric         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
71620b57cec5SDimitry Andric     if (canTruncateToMinimalBitwidth(I, VF)) {
71630b57cec5SDimitry Andric       // This cast is going to be shrunk. This may remove the cast or it might
71640b57cec5SDimitry Andric       // turn it into slightly different cast. For example, if MinBW == 16,
71650b57cec5SDimitry Andric       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
71660b57cec5SDimitry Andric       //
71670b57cec5SDimitry Andric       // Calculate the modified src and dest types.
71680b57cec5SDimitry Andric       Type *MinVecTy = VectorTy;
7169e8d8bef9SDimitry Andric       if (Opcode == Instruction::Trunc) {
71700b57cec5SDimitry Andric         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
71710b57cec5SDimitry Andric         VectorTy =
71720b57cec5SDimitry Andric             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7173e8d8bef9SDimitry Andric       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7174fe013be4SDimitry Andric         // Leave SrcVecTy unchanged - we only shrink the destination element
7175fe013be4SDimitry Andric         // type.
71760b57cec5SDimitry Andric         VectorTy =
71770b57cec5SDimitry Andric             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
71780b57cec5SDimitry Andric       }
71790b57cec5SDimitry Andric     }
71800b57cec5SDimitry Andric 
7181fe6060f1SDimitry Andric     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
71820b57cec5SDimitry Andric   }
7183c9157d92SDimitry Andric   case Instruction::Call:
7184c9157d92SDimitry Andric     return getVectorCallCost(cast<CallInst>(I), VF);
7185e8d8bef9SDimitry Andric   case Instruction::ExtractValue:
7186e8d8bef9SDimitry Andric     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7187fe6060f1SDimitry Andric   case Instruction::Alloca:
7188fe6060f1SDimitry Andric     // We cannot easily widen alloca to a scalable alloca, as
7189fe6060f1SDimitry Andric     // the result would need to be a vector of pointers.
7190fe6060f1SDimitry Andric     if (VF.isScalable())
7191fe6060f1SDimitry Andric       return InstructionCost::getInvalid();
7192bdd1243dSDimitry Andric     [[fallthrough]];
71930b57cec5SDimitry Andric   default:
7194fe6060f1SDimitry Andric     // This opcode is unknown. Assume that it is the same as 'mul'.
7195fe6060f1SDimitry Andric     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
71960b57cec5SDimitry Andric   } // end of switch.
71970b57cec5SDimitry Andric }
71980b57cec5SDimitry Andric 
71990b57cec5SDimitry Andric void LoopVectorizationCostModel::collectValuesToIgnore() {
72000b57cec5SDimitry Andric   // Ignore ephemeral values.
72010b57cec5SDimitry Andric   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
72020b57cec5SDimitry Andric 
720381ad6265SDimitry Andric   // Find all stores to invariant variables. Since they are going to be sunk
720481ad6265SDimitry Andric   // outside the loop, we do not need to calculate their cost.
720581ad6265SDimitry Andric   for (BasicBlock *BB : TheLoop->blocks())
720681ad6265SDimitry Andric     for (Instruction &I : *BB) {
720781ad6265SDimitry Andric       StoreInst *SI;
720881ad6265SDimitry Andric       if ((SI = dyn_cast<StoreInst>(&I)) &&
720981ad6265SDimitry Andric           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
721081ad6265SDimitry Andric         ValuesToIgnore.insert(&I);
721181ad6265SDimitry Andric     }
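721181ad6265SDimitry Andric   // For illustration (hypothetical source): in
721181ad6265SDimitry Andric   //   for (i = 0; i < n; i++) { sum += a[i]; *p = sum; }
721181ad6265SDimitry Andric   // the store to the loop-invariant address p is sunk out of the loop, so
721181ad6265SDimitry Andric   // its per-iteration cost is ignored.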
721281ad6265SDimitry Andric 
72130b57cec5SDimitry Andric   // Ignore type-promoting instructions we identified during reduction
72140b57cec5SDimitry Andric   // detection.
7215bdd1243dSDimitry Andric   for (const auto &Reduction : Legal->getReductionVars()) {
72160eae32dcSDimitry Andric     const RecurrenceDescriptor &RedDes = Reduction.second;
7217e8d8bef9SDimitry Andric     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
72180b57cec5SDimitry Andric     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
72190b57cec5SDimitry Andric   }
72200b57cec5SDimitry Andric   // Ignore type-casting instructions we identified during induction
72210b57cec5SDimitry Andric   // detection.
7222bdd1243dSDimitry Andric   for (const auto &Induction : Legal->getInductionVars()) {
72230eae32dcSDimitry Andric     const InductionDescriptor &IndDes = Induction.second;
72240b57cec5SDimitry Andric     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
72250b57cec5SDimitry Andric     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
72260b57cec5SDimitry Andric   }
72270b57cec5SDimitry Andric }
72280b57cec5SDimitry Andric 
7229e8d8bef9SDimitry Andric void LoopVectorizationCostModel::collectInLoopReductions() {
7230bdd1243dSDimitry Andric   for (const auto &Reduction : Legal->getReductionVars()) {
7231e8d8bef9SDimitry Andric     PHINode *Phi = Reduction.first;
72320eae32dcSDimitry Andric     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7233e8d8bef9SDimitry Andric 
7234e8d8bef9SDimitry Andric     // We don't collect reductions that are type promoted (yet).
7235e8d8bef9SDimitry Andric     if (RdxDesc.getRecurrenceType() != Phi->getType())
7236e8d8bef9SDimitry Andric       continue;
7237e8d8bef9SDimitry Andric 
7238e8d8bef9SDimitry Andric     // If the target would prefer this reduction to happen "in-loop", then we
7239e8d8bef9SDimitry Andric     // want to record it as such.
7240e8d8bef9SDimitry Andric     unsigned Opcode = RdxDesc.getOpcode();
7241fe6060f1SDimitry Andric     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7242e8d8bef9SDimitry Andric         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7243e8d8bef9SDimitry Andric                                    TargetTransformInfo::ReductionFlags()))
7244e8d8bef9SDimitry Andric       continue;
7245e8d8bef9SDimitry Andric 
7246e8d8bef9SDimitry Andric     // Check that we can correctly put the reductions into the loop, by
7247e8d8bef9SDimitry Andric     // finding the chain of operations that leads from the phi to the loop
7248e8d8bef9SDimitry Andric     // exit value.
7249e8d8bef9SDimitry Andric     SmallVector<Instruction *, 4> ReductionOperations =
7250e8d8bef9SDimitry Andric         RdxDesc.getReductionOpChain(Phi, TheLoop);
7251e8d8bef9SDimitry Andric     bool InLoop = !ReductionOperations.empty();
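7251e8d8bef9SDimitry Andric     // For illustration (hypothetical IR), for an integer add reduction
7251e8d8bef9SDimitry Andric     //   %sum = phi i32 [ 0, %ph ], [ %sum.next, %loop ]
7251e8d8bef9SDimitry Andric     //   %sum.next = add i32 %sum, %val
7251e8d8bef9SDimitry Andric     // the chain is {%sum.next}, and InLoopReductionImmediateChains maps
7251e8d8bef9SDimitry Andric     // %sum.next back to the phi %sum.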
7252c9157d92SDimitry Andric 
7253e8d8bef9SDimitry Andric     if (InLoop) {
7254c9157d92SDimitry Andric       InLoopReductions.insert(Phi);
7255e8d8bef9SDimitry Andric       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7256e8d8bef9SDimitry Andric       Instruction *LastChain = Phi;
7257e8d8bef9SDimitry Andric       for (auto *I : ReductionOperations) {
7258e8d8bef9SDimitry Andric         InLoopReductionImmediateChains[I] = LastChain;
7259e8d8bef9SDimitry Andric         LastChain = I;
7260e8d8bef9SDimitry Andric       }
7261e8d8bef9SDimitry Andric     }
7262e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7263e8d8bef9SDimitry Andric                       << " reduction for phi: " << *Phi << "\n");
7264e8d8bef9SDimitry Andric   }
7265e8d8bef9SDimitry Andric }
7266e8d8bef9SDimitry Andric 
7267c9157d92SDimitry Andric VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7268c9157d92SDimitry Andric                                DebugLoc DL, const Twine &Name) {
7269c9157d92SDimitry Andric   assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7270c9157d92SDimitry Andric          Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7271c9157d92SDimitry Andric   return tryInsertInstruction(
7272c9157d92SDimitry Andric       new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7273c9157d92SDimitry Andric }
7274c9157d92SDimitry Andric 
7275c9157d92SDimitry Andric // This function will select a scalable VF if the target supports scalable
7276c9157d92SDimitry Andric // vectors and a fixed one otherwise.
72770b57cec5SDimitry Andric // TODO: we could return a pair of values that specify the max VF and
72780b57cec5SDimitry Andric // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
72790b57cec5SDimitry Andric // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
72800b57cec5SDimitry Andric // doesn't have a cost model that can choose which plan to execute if
72810b57cec5SDimitry Andric // more than one is generated.
7282c9157d92SDimitry Andric static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
72830b57cec5SDimitry Andric                                      LoopVectorizationCostModel &CM) {
72840b57cec5SDimitry Andric   unsigned WidestType;
72850b57cec5SDimitry Andric   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7286c9157d92SDimitry Andric 
7287c9157d92SDimitry Andric   TargetTransformInfo::RegisterKind RegKind =
7288c9157d92SDimitry Andric       TTI.enableScalableVectorization()
7289c9157d92SDimitry Andric           ? TargetTransformInfo::RGK_ScalableVector
7290c9157d92SDimitry Andric           : TargetTransformInfo::RGK_FixedWidthVector;
7291c9157d92SDimitry Andric 
7292c9157d92SDimitry Andric   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7293c9157d92SDimitry Andric   unsigned N = RegSize.getKnownMinValue() / WidestType;
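7293c9157d92SDimitry Andric   // For example (illustrative): 512-bit fixed-width registers and a widest
7293c9157d92SDimitry Andric   // type of i32 give N = 512 / 32 = 16, i.e. a fixed VF of 16.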
7294c9157d92SDimitry Andric   return ElementCount::get(N, RegSize.isScalable());
72950b57cec5SDimitry Andric }
72960b57cec5SDimitry Andric 
72970b57cec5SDimitry Andric VectorizationFactor
7298e8d8bef9SDimitry Andric LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7299e8d8bef9SDimitry Andric   ElementCount VF = UserVF;
73000b57cec5SDimitry Andric   // Outer loop handling: They may require CFG and instruction level
73010b57cec5SDimitry Andric   // transformations before even evaluating whether vectorization is profitable.
73020b57cec5SDimitry Andric   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
73030b57cec5SDimitry Andric   // the vectorization pipeline.
7304e8d8bef9SDimitry Andric   if (!OrigLoop->isInnermost()) {
73050b57cec5SDimitry Andric     // If the user doesn't provide a vectorization factor, determine a
73060b57cec5SDimitry Andric     // reasonable one.
7307e8d8bef9SDimitry Andric     if (UserVF.isZero()) {
7308c9157d92SDimitry Andric       VF = determineVPlanVF(TTI, CM);
73090b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
73100b57cec5SDimitry Andric 
73110b57cec5SDimitry Andric       // Make sure we have a VF > 1 for stress testing.
7312e8d8bef9SDimitry Andric       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
73130b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
73140b57cec5SDimitry Andric                           << "overriding computed VF.\n");
7315e8d8bef9SDimitry Andric         VF = ElementCount::getFixed(4);
73160b57cec5SDimitry Andric       }
7317c9157d92SDimitry Andric     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7318c9157d92SDimitry Andric                !ForceTargetSupportsScalableVectors) {
7319c9157d92SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7320c9157d92SDimitry Andric                         << "not supported by the target.\n");
7321c9157d92SDimitry Andric       reportVectorizationFailure(
7322c9157d92SDimitry Andric           "Scalable vectorization requested but not supported by the target",
7323c9157d92SDimitry Andric           "the scalable user-specified vectorization width for outer-loop "
7324c9157d92SDimitry Andric           "vectorization cannot be used because the target does not support "
7325c9157d92SDimitry Andric           "scalable vectors.",
7326c9157d92SDimitry Andric           "ScalableVFUnfeasible", ORE, OrigLoop);
7327c9157d92SDimitry Andric       return VectorizationFactor::Disabled();
73280b57cec5SDimitry Andric     }
73290b57cec5SDimitry Andric     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7330e8d8bef9SDimitry Andric     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7331e8d8bef9SDimitry Andric            "VF needs to be a power of two");
7332e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7333e8d8bef9SDimitry Andric                       << "VF " << VF << " to build VPlans.\n");
73340b57cec5SDimitry Andric     buildVPlans(VF, VF);
73350b57cec5SDimitry Andric 
73360b57cec5SDimitry Andric     // For VPlan build stress testing, we bail out after VPlan construction.
73370b57cec5SDimitry Andric     if (VPlanBuildStressTest)
73380b57cec5SDimitry Andric       return VectorizationFactor::Disabled();
73390b57cec5SDimitry Andric 
734081ad6265SDimitry Andric     return {VF, 0 /*Cost*/, 0 /*ScalarCost*/};
73410b57cec5SDimitry Andric   }
73420b57cec5SDimitry Andric 
73430b57cec5SDimitry Andric   LLVM_DEBUG(
73440b57cec5SDimitry Andric       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
73450b57cec5SDimitry Andric                 "VPlan-native path.\n");
73460b57cec5SDimitry Andric   return VectorizationFactor::Disabled();
73470b57cec5SDimitry Andric }
73480b57cec5SDimitry Andric 
7349bdd1243dSDimitry Andric std::optional<VectorizationFactor>
7350e8d8bef9SDimitry Andric LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7351e8d8bef9SDimitry Andric   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7352fe013be4SDimitry Andric   CM.collectValuesToIgnore();
7353fe013be4SDimitry Andric   CM.collectElementTypesForWidening();
7354fe013be4SDimitry Andric 
7355fe6060f1SDimitry Andric   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7356fe6060f1SDimitry Andric   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7357bdd1243dSDimitry Andric     return std::nullopt;
73580b57cec5SDimitry Andric 
73590b57cec5SDimitry Andric   // Invalidate interleave groups if all blocks of loop will be predicated.
7360349cc55cSDimitry Andric   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7361fe013be4SDimitry Andric       !useMaskedInterleavedAccesses(TTI)) {
73620b57cec5SDimitry Andric     LLVM_DEBUG(
73630b57cec5SDimitry Andric         dbgs()
73640b57cec5SDimitry Andric         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
73650b57cec5SDimitry Andric            "which requires masked-interleaved support.\n");
73665ffd83dbSDimitry Andric     if (CM.InterleaveInfo.invalidateGroups())
73675ffd83dbSDimitry Andric       // Invalidating interleave groups also requires invalidating all decisions
73685ffd83dbSDimitry Andric       // based on them, which includes widening decisions and uniform and scalar
73695ffd83dbSDimitry Andric       // values.
73705ffd83dbSDimitry Andric       CM.invalidateCostModelingDecisions();
73710b57cec5SDimitry Andric   }
73720b57cec5SDimitry Andric 
7373fe6060f1SDimitry Andric   ElementCount MaxUserVF =
7374fe6060f1SDimitry Andric       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7375fe6060f1SDimitry Andric   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7376fe6060f1SDimitry Andric   if (!UserVF.isZero() && UserVFIsLegal) {
7377fe6060f1SDimitry Andric     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7378e8d8bef9SDimitry Andric            "VF needs to be a power of two");
73790b57cec5SDimitry Andric     // Collect the instructions (and their associated costs) that will be more
73800b57cec5SDimitry Andric     // profitable to scalarize.
7381c9157d92SDimitry Andric     CM.collectInLoopReductions();
7382fe6060f1SDimitry Andric     if (CM.selectUserVectorizationFactor(UserVF)) {
7383fe6060f1SDimitry Andric       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7384fe6060f1SDimitry Andric       buildVPlansWithVPRecipes(UserVF, UserVF);
7385fe013be4SDimitry Andric       if (!hasPlanWithVF(UserVF)) {
7386fe013be4SDimitry Andric         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7387fe013be4SDimitry Andric                           << ".\n");
7388fe013be4SDimitry Andric         return std::nullopt;
7389fe013be4SDimitry Andric       }
7390fe013be4SDimitry Andric 
73910b57cec5SDimitry Andric       LLVM_DEBUG(printPlans(dbgs()));
739281ad6265SDimitry Andric       return {{UserVF, 0, 0}};
7393fe6060f1SDimitry Andric     } else
7394fe6060f1SDimitry Andric       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7395fe6060f1SDimitry Andric                               "InvalidCost", ORE, OrigLoop);
73960b57cec5SDimitry Andric   }
73970b57cec5SDimitry Andric 
7398fe6060f1SDimitry Andric   // Populate the set of Vectorization Factor Candidates.
7399fe6060f1SDimitry Andric   ElementCountSet VFCandidates;
7400fe6060f1SDimitry Andric   for (auto VF = ElementCount::getFixed(1);
7401fe6060f1SDimitry Andric        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7402fe6060f1SDimitry Andric     VFCandidates.insert(VF);
7403fe6060f1SDimitry Andric   for (auto VF = ElementCount::getScalable(1);
7404fe6060f1SDimitry Andric        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7405fe6060f1SDimitry Andric     VFCandidates.insert(VF);
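7405fe6060f1SDimitry Andric   // For example (illustrative): MaxFactors = {fixed: 16, scalable: 4} yields
7405fe6060f1SDimitry Andric   // candidates {1, 2, 4, 8, 16} and {vscale x 1, vscale x 2, vscale x 4}.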
74060b57cec5SDimitry Andric 
7407c9157d92SDimitry Andric   CM.collectInLoopReductions();
7408fe6060f1SDimitry Andric   for (const auto &VF : VFCandidates) {
74090b57cec5SDimitry Andric     // Collect Uniform and Scalar instructions after vectorization with VF.
74100b57cec5SDimitry Andric     CM.collectUniformsAndScalars(VF);
74110b57cec5SDimitry Andric 
74120b57cec5SDimitry Andric     // Collect the instructions (and their associated costs) that will be more
74130b57cec5SDimitry Andric     // profitable to scalarize.
7414e8d8bef9SDimitry Andric     if (VF.isVector())
74150b57cec5SDimitry Andric       CM.collectInstsToScalarize(VF);
74160b57cec5SDimitry Andric   }
74170b57cec5SDimitry Andric 
7418fe6060f1SDimitry Andric   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7419fe6060f1SDimitry Andric   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7420e8d8bef9SDimitry Andric 
74210b57cec5SDimitry Andric   LLVM_DEBUG(printPlans(dbgs()));
7422fe6060f1SDimitry Andric   if (!MaxFactors.hasVector())
74230b57cec5SDimitry Andric     return VectorizationFactor::Disabled();
74240b57cec5SDimitry Andric 
74250b57cec5SDimitry Andric   // Select the optimal vectorization factor.
7426fe013be4SDimitry Andric   VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7427753f127fSDimitry Andric   assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
7427753f127fSDimitry Andric          "when vectorizing, the scalar cost must be non-zero.");
7428fe013be4SDimitry Andric   if (!hasPlanWithVF(VF.Width)) {
7429fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7430fe013be4SDimitry Andric                       << ".\n");
7431fe013be4SDimitry Andric     return std::nullopt;
7432fe013be4SDimitry Andric   }
7433753f127fSDimitry Andric   return VF;
74340b57cec5SDimitry Andric }
74350b57cec5SDimitry Andric 
7436349cc55cSDimitry Andric VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7437349cc55cSDimitry Andric   assert(count_if(VPlans,
7438349cc55cSDimitry Andric                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7439349cc55cSDimitry Andric              1 &&
7440349cc55cSDimitry Andric          "Expected exactly one VPlan for the best VF.");
74410b57cec5SDimitry Andric 
7442349cc55cSDimitry Andric   for (const VPlanPtr &Plan : VPlans) {
7443349cc55cSDimitry Andric     if (Plan->hasVF(VF))
7444349cc55cSDimitry Andric       return *Plan;
7445349cc55cSDimitry Andric   }
7446349cc55cSDimitry Andric   llvm_unreachable("No plan found!");
74470b57cec5SDimitry Andric }
74480b57cec5SDimitry Andric 
744904eeddc0SDimitry Andric static void AddRuntimeUnrollDisableMetaData(Loop *L) {
745004eeddc0SDimitry Andric   SmallVector<Metadata *, 4> MDs;
745104eeddc0SDimitry Andric   // Reserve first location for self reference to the LoopID metadata node.
745204eeddc0SDimitry Andric   MDs.push_back(nullptr);
745304eeddc0SDimitry Andric   bool IsUnrollMetadata = false;
745404eeddc0SDimitry Andric   MDNode *LoopID = L->getLoopID();
745504eeddc0SDimitry Andric   if (LoopID) {
745604eeddc0SDimitry Andric     // First find existing loop unrolling disable metadata.
745704eeddc0SDimitry Andric     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
745804eeddc0SDimitry Andric       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
745904eeddc0SDimitry Andric       if (MD) {
746004eeddc0SDimitry Andric         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
746104eeddc0SDimitry Andric         IsUnrollMetadata =
7462c9157d92SDimitry Andric             S && S->getString().starts_with("llvm.loop.unroll.disable");
746304eeddc0SDimitry Andric       }
746404eeddc0SDimitry Andric       MDs.push_back(LoopID->getOperand(i));
746504eeddc0SDimitry Andric     }
746604eeddc0SDimitry Andric   }
746704eeddc0SDimitry Andric 
746804eeddc0SDimitry Andric   if (!IsUnrollMetadata) {
746904eeddc0SDimitry Andric     // Add runtime unroll disable metadata.
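746904eeddc0SDimitry Andric     // The resulting loop metadata has the following shape (illustrative):
746904eeddc0SDimitry Andric     //   !0 = distinct !{!0, <existing operands...>, !1}
746904eeddc0SDimitry Andric     //   !1 = !{!"llvm.loop.unroll.runtime.disable"}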
747004eeddc0SDimitry Andric     LLVMContext &Context = L->getHeader()->getContext();
747104eeddc0SDimitry Andric     SmallVector<Metadata *, 1> DisableOperands;
747204eeddc0SDimitry Andric     DisableOperands.push_back(
747304eeddc0SDimitry Andric         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
747404eeddc0SDimitry Andric     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
747504eeddc0SDimitry Andric     MDs.push_back(DisableNode);
747604eeddc0SDimitry Andric     MDNode *NewLoopID = MDNode::get(Context, MDs);
747704eeddc0SDimitry Andric     // Set operand 0 to refer to the loop id itself.
747804eeddc0SDimitry Andric     NewLoopID->replaceOperandWith(0, NewLoopID);
747904eeddc0SDimitry Andric     L->setLoopID(NewLoopID);
748004eeddc0SDimitry Andric   }
748104eeddc0SDimitry Andric }
748204eeddc0SDimitry Andric 
7483cdc20ff6SDimitry Andric // Check if \p RedResult is a ComputeReductionResult instruction, and if it
7484cdc20ff6SDimitry Andric // is, create a merge phi node for it and add it to \p ReductionResumeValues.
7485cdc20ff6SDimitry Andric static void createAndCollectMergePhiForReduction(
7486cdc20ff6SDimitry Andric     VPInstruction *RedResult,
7487cdc20ff6SDimitry Andric     DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7488cdc20ff6SDimitry Andric     VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7489cdc20ff6SDimitry Andric   if (!RedResult ||
7490cdc20ff6SDimitry Andric       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7491cdc20ff6SDimitry Andric     return;
7492cdc20ff6SDimitry Andric 
7493cdc20ff6SDimitry Andric   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7494cdc20ff6SDimitry Andric   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7495cdc20ff6SDimitry Andric 
7496cdc20ff6SDimitry Andric   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7497cdc20ff6SDimitry Andric   Value *FinalValue =
7498cdc20ff6SDimitry Andric       State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7499cdc20ff6SDimitry Andric   auto *ResumePhi =
7500cdc20ff6SDimitry Andric       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7501cdc20ff6SDimitry Andric 
7502cdc20ff6SDimitry Andric   // TODO: bc.merge.rdx should not be created here, instead it should be
7503cdc20ff6SDimitry Andric   // modeled in VPlan.
7504cdc20ff6SDimitry Andric   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7505cdc20ff6SDimitry Andric   // Create a phi node that merges control-flow from the backedge-taken check
7506cdc20ff6SDimitry Andric   // block and the middle block.
7507cdc20ff6SDimitry Andric   auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7508cdc20ff6SDimitry Andric                                      LoopScalarPreHeader->getTerminator());
7509cdc20ff6SDimitry Andric 
7510cdc20ff6SDimitry Andric   // If we are fixing reductions in the epilogue loop then we should already
7511cdc20ff6SDimitry Andric   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7512cdc20ff6SDimitry Andric   // we carry over the incoming values correctly.
7513cdc20ff6SDimitry Andric   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7514cdc20ff6SDimitry Andric     if (Incoming == LoopMiddleBlock)
7515cdc20ff6SDimitry Andric       BCBlockPhi->addIncoming(FinalValue, Incoming);
7516cdc20ff6SDimitry Andric     else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7517cdc20ff6SDimitry Andric       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7518cdc20ff6SDimitry Andric                               Incoming);
7519cdc20ff6SDimitry Andric     else
7520cdc20ff6SDimitry Andric       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7521cdc20ff6SDimitry Andric   }
7522cdc20ff6SDimitry Andric 
7523cdc20ff6SDimitry Andric   auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7524cdc20ff6SDimitry Andric   // TODO: This fixup should instead be modeled in VPlan.
7525cdc20ff6SDimitry Andric   // Fix the scalar loop reduction variable with the incoming reduction sum
7526cdc20ff6SDimitry Andric   // from the vector body and from the backedge value.
7527cdc20ff6SDimitry Andric   int IncomingEdgeBlockIdx =
7528cdc20ff6SDimitry Andric       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7529cdc20ff6SDimitry Andric   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7530cdc20ff6SDimitry Andric   // Pick the other block.
7531cdc20ff6SDimitry Andric   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7532cdc20ff6SDimitry Andric   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7533cdc20ff6SDimitry Andric   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7534cdc20ff6SDimitry Andric   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7535cdc20ff6SDimitry Andric 
7536cdc20ff6SDimitry Andric   ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7537cdc20ff6SDimitry Andric }
7538cdc20ff6SDimitry Andric 
7539cdc20ff6SDimitry Andric std::pair<DenseMap<const SCEV *, Value *>,
7540cdc20ff6SDimitry Andric           DenseMap<const RecurrenceDescriptor *, Value *>>
7541cdc20ff6SDimitry Andric LoopVectorizationPlanner::executePlan(
7542fe013be4SDimitry Andric     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7543fe013be4SDimitry Andric     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7544c9157d92SDimitry Andric     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7545bdd1243dSDimitry Andric   assert(BestVPlan.hasVF(BestVF) &&
7546bdd1243dSDimitry Andric          "Trying to execute plan with unsupported VF");
7547bdd1243dSDimitry Andric   assert(BestVPlan.hasUF(BestUF) &&
7548bdd1243dSDimitry Andric          "Trying to execute plan with unsupported UF");
7549fe013be4SDimitry Andric   assert(
7550fe013be4SDimitry Andric       (IsEpilogueVectorization || !ExpandedSCEVs) &&
7551fe013be4SDimitry Andric       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7552bdd1243dSDimitry Andric 
7553349cc55cSDimitry Andric   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7554349cc55cSDimitry Andric                     << ", UF=" << BestUF << '\n');
7555349cc55cSDimitry Andric 
7556bdd1243dSDimitry Andric   if (!IsEpilogueVectorization)
7557bdd1243dSDimitry Andric     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7558bdd1243dSDimitry Andric 
75590b57cec5SDimitry Andric   // Perform the actual loop transformation.
7560c9157d92SDimitry Andric   VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7561c9157d92SDimitry Andric                          OrigLoop->getHeader()->getContext());
7562fe013be4SDimitry Andric 
7563fe013be4SDimitry Andric   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7564fe013be4SDimitry Andric   // before making any changes to the CFG.
7565fe013be4SDimitry Andric   if (!BestVPlan.getPreheader()->empty()) {
7566fe013be4SDimitry Andric     State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7567fe013be4SDimitry Andric     State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7568fe013be4SDimitry Andric     BestVPlan.getPreheader()->execute(&State);
7569fe013be4SDimitry Andric   }
7570fe013be4SDimitry Andric   if (!ILV.getTripCount())
7571fe013be4SDimitry Andric     ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7572fe013be4SDimitry Andric   else
7573fe013be4SDimitry Andric     assert(IsEpilogueVectorization && "should only re-use the existing trip "
7574fe013be4SDimitry Andric                                       "count during epilogue vectorization");
75750b57cec5SDimitry Andric 
757681ad6265SDimitry Andric   // 1. Set up the skeleton for vectorization, including vector pre-header and
757781ad6265SDimitry Andric   // middle block. The vector loop is created during VPlan execution.
757804eeddc0SDimitry Andric   Value *CanonicalIVStartValue;
757904eeddc0SDimitry Andric   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7580fe013be4SDimitry Andric       ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7581fe013be4SDimitry Andric                                                      : State.ExpandedSCEVs);
758281ad6265SDimitry Andric 
758381ad6265SDimitry Andric   // Only use noalias metadata when using memory checks guaranteeing no overlap
758481ad6265SDimitry Andric   // across all iterations.
758581ad6265SDimitry Andric   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7586fe013be4SDimitry Andric   std::unique_ptr<LoopVersioning> LVer = nullptr;
758781ad6265SDimitry Andric   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
758881ad6265SDimitry Andric       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
758981ad6265SDimitry Andric 
759081ad6265SDimitry Andric     //  We currently don't use LoopVersioning for the actual loop cloning but we
759181ad6265SDimitry Andric     //  still use it to add the noalias metadata.
759281ad6265SDimitry Andric     //  TODO: Find a better way to re-use LoopVersioning functionality to add
759381ad6265SDimitry Andric     //        metadata.
7594fe013be4SDimitry Andric     LVer = std::make_unique<LoopVersioning>(
759581ad6265SDimitry Andric         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
759681ad6265SDimitry Andric         PSE.getSE());
7597fe013be4SDimitry Andric     State.LVer = &*LVer;
759881ad6265SDimitry Andric     State.LVer->prepareNoAliasMetadata();
759981ad6265SDimitry Andric   }
760081ad6265SDimitry Andric 
76014824e7fdSDimitry Andric   ILV.collectPoisonGeneratingRecipes(State);
76020b57cec5SDimitry Andric 
7603e8d8bef9SDimitry Andric   ILV.printDebugTracesAtStart();
7604e8d8bef9SDimitry Andric 
76050b57cec5SDimitry Andric   //===------------------------------------------------===//
76060b57cec5SDimitry Andric   //
76070b57cec5SDimitry Andric   // Notice: any optimization or new instruction that go
76080b57cec5SDimitry Andric   // into the code below should also be implemented in
76090b57cec5SDimitry Andric   // the cost-model.
76100b57cec5SDimitry Andric   //
76110b57cec5SDimitry Andric   //===------------------------------------------------===//
76120b57cec5SDimitry Andric 
76130b57cec5SDimitry Andric   // 2. Copy and widen instructions from the old loop into the new loop.
7614c9157d92SDimitry Andric   BestVPlan.prepareToExecute(ILV.getTripCount(),
7615c9157d92SDimitry Andric                              ILV.getOrCreateVectorTripCount(nullptr),
7616c9157d92SDimitry Andric                              CanonicalIVStartValue, State);
761781ad6265SDimitry Andric 
7618349cc55cSDimitry Andric   BestVPlan.execute(&State);
76190b57cec5SDimitry Andric 
7620cdc20ff6SDimitry Andric   // 2.5 Collect reduction resume values.
7621cdc20ff6SDimitry Andric   DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7622cdc20ff6SDimitry Andric   auto *ExitVPBB =
7623cdc20ff6SDimitry Andric       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7624cdc20ff6SDimitry Andric   for (VPRecipeBase &R : *ExitVPBB) {
7625cdc20ff6SDimitry Andric     createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7626cdc20ff6SDimitry Andric                                          ReductionResumeValues, State, OrigLoop,
7627cdc20ff6SDimitry Andric                                          State.CFG.VPBB2IRBB[ExitVPBB]);
7628cdc20ff6SDimitry Andric   }
7629cdc20ff6SDimitry Andric 
7630cdc20ff6SDimitry Andric   // 2.6. Maintain Loop Hints
763104eeddc0SDimitry Andric   // Keep all loop hints from the original loop on the vector loop (we'll
763204eeddc0SDimitry Andric   // replace the vectorizer-specific hints below).
763304eeddc0SDimitry Andric   MDNode *OrigLoopID = OrigLoop->getLoopID();
763404eeddc0SDimitry Andric 
7635bdd1243dSDimitry Andric   std::optional<MDNode *> VectorizedLoopID =
763604eeddc0SDimitry Andric       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
763704eeddc0SDimitry Andric                                       LLVMLoopVectorizeFollowupVectorized});
763804eeddc0SDimitry Andric 
763981ad6265SDimitry Andric   VPBasicBlock *HeaderVPBB =
764081ad6265SDimitry Andric       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
764181ad6265SDimitry Andric   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
764281ad6265SDimitry Andric   if (VectorizedLoopID)
7643bdd1243dSDimitry Andric     L->setLoopID(*VectorizedLoopID);
764404eeddc0SDimitry Andric   else {
764704eeddc0SDimitry Andric     if (MDNode *LID = OrigLoop->getLoopID())
764804eeddc0SDimitry Andric       L->setLoopID(LID);
764904eeddc0SDimitry Andric 
765004eeddc0SDimitry Andric     LoopVectorizeHints Hints(L, true, *ORE);
765104eeddc0SDimitry Andric     Hints.setAlreadyVectorized();
765204eeddc0SDimitry Andric   }
7653fe013be4SDimitry Andric   TargetTransformInfo::UnrollingPreferences UP;
7654fe013be4SDimitry Andric   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7655fe013be4SDimitry Andric   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
765604eeddc0SDimitry Andric     AddRuntimeUnrollDisableMetaData(L);
765704eeddc0SDimitry Andric 
76580b57cec5SDimitry Andric   // 3. Fix the vectorized code: take care of header phi's, live-outs,
76590b57cec5SDimitry Andric   //    predication, updating analyses.
766081ad6265SDimitry Andric   ILV.fixVectorizedLoop(State, BestVPlan);
7661e8d8bef9SDimitry Andric 
7662e8d8bef9SDimitry Andric   ILV.printDebugTracesAtEnd();
7663fe013be4SDimitry Andric 
7664cdc20ff6SDimitry Andric   return {State.ExpandedSCEVs, ReductionResumeValues};
76650b57cec5SDimitry Andric }
76660b57cec5SDimitry Andric 
7667fe6060f1SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7668fe6060f1SDimitry Andric void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7669fe6060f1SDimitry Andric   for (const auto &Plan : VPlans)
7670fe6060f1SDimitry Andric     if (PrintVPlansInDotFormat)
7671fe6060f1SDimitry Andric       Plan->printDOT(O);
7672fe6060f1SDimitry Andric     else
7673fe6060f1SDimitry Andric       Plan->print(O);
7674fe6060f1SDimitry Andric }
7675fe6060f1SDimitry Andric #endif
7676fe6060f1SDimitry Andric 
7677e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7678e8d8bef9SDimitry Andric // EpilogueVectorizerMainLoop
7679e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7680e8d8bef9SDimitry Andric 
7681e8d8bef9SDimitry Andric /// This function is partially responsible for generating the control flow
7682e8d8bef9SDimitry Andric /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
768304eeddc0SDimitry Andric std::pair<BasicBlock *, Value *>
7684fe013be4SDimitry Andric EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7685fe013be4SDimitry Andric     const SCEV2ValueTy &ExpandedSCEVs) {
768681ad6265SDimitry Andric   createVectorLoopSkeleton("");
7687e8d8bef9SDimitry Andric 
7688e8d8bef9SDimitry Andric   // Generate the code to check the minimum iteration count of the vector
7689e8d8bef9SDimitry Andric   // epilogue (see below).
7690e8d8bef9SDimitry Andric   EPI.EpilogueIterationCountCheck =
769181ad6265SDimitry Andric       emitIterationCountCheck(LoopScalarPreHeader, true);
7692e8d8bef9SDimitry Andric   EPI.EpilogueIterationCountCheck->setName("iter.check");
7693e8d8bef9SDimitry Andric 
7694e8d8bef9SDimitry Andric   // Generate the code to check any assumptions that we've made for SCEV
7695e8d8bef9SDimitry Andric   // expressions.
769681ad6265SDimitry Andric   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7697e8d8bef9SDimitry Andric 
7698e8d8bef9SDimitry Andric   // Generate the code that checks at runtime if arrays overlap. We put the
7699e8d8bef9SDimitry Andric   // checks into a separate block to make the more common case of few elements
7700e8d8bef9SDimitry Andric   // faster.
770181ad6265SDimitry Andric   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7702e8d8bef9SDimitry Andric 
7703e8d8bef9SDimitry Andric   // Generate the iteration count check for the main loop, *after* the check
7704e8d8bef9SDimitry Andric   // for the epilogue loop, so that the path-length is shorter for the case
7705e8d8bef9SDimitry Andric   // that goes directly through the vector epilogue. The longer-path length for
7706e8d8bef9SDimitry Andric   // the main loop is compensated for, by the gain from vectorizing the larger
7707e8d8bef9SDimitry Andric   // trip count. Note: the branch will get updated later on when we vectorize
7708e8d8bef9SDimitry Andric   // the epilogue.
7709e8d8bef9SDimitry Andric   EPI.MainLoopIterationCountCheck =
771081ad6265SDimitry Andric       emitIterationCountCheck(LoopScalarPreHeader, false);
7711e8d8bef9SDimitry Andric 
7712e8d8bef9SDimitry Andric   // Generate the induction variable.
771381ad6265SDimitry Andric   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7714e8d8bef9SDimitry Andric 
7715e8d8bef9SDimitry Andric   // Skip induction resume value creation here because they will be created in
7716bdd1243dSDimitry Andric   // the second pass for the scalar loop. The induction resume values for the
7717bdd1243dSDimitry Andric   // inductions in the epilogue loop are created before executing the plan for
7718bdd1243dSDimitry Andric   // the epilogue loop.
7719e8d8bef9SDimitry Andric 
7720bdd1243dSDimitry Andric   return {completeLoopSkeleton(), nullptr};
7721e8d8bef9SDimitry Andric }
7722e8d8bef9SDimitry Andric 
7723e8d8bef9SDimitry Andric void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7724e8d8bef9SDimitry Andric   LLVM_DEBUG({
7725e8d8bef9SDimitry Andric     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7726349cc55cSDimitry Andric            << "Main Loop VF:" << EPI.MainLoopVF
7727e8d8bef9SDimitry Andric            << ", Main Loop UF:" << EPI.MainLoopUF
7728349cc55cSDimitry Andric            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7729e8d8bef9SDimitry Andric            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7730e8d8bef9SDimitry Andric   });
7731e8d8bef9SDimitry Andric }
7732e8d8bef9SDimitry Andric 
7733e8d8bef9SDimitry Andric void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7734e8d8bef9SDimitry Andric   DEBUG_WITH_TYPE(VerboseDebug, {
77354824e7fdSDimitry Andric     dbgs() << "intermediate fn:\n"
77364824e7fdSDimitry Andric            << *OrigLoop->getHeader()->getParent() << "\n";
7737e8d8bef9SDimitry Andric   });
7738e8d8bef9SDimitry Andric }
7739e8d8bef9SDimitry Andric 
774081ad6265SDimitry Andric BasicBlock *
774181ad6265SDimitry Andric EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
774281ad6265SDimitry Andric                                                     bool ForEpilogue) {
7743e8d8bef9SDimitry Andric   assert(Bypass && "Expected valid bypass basic block.");
7744349cc55cSDimitry Andric   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7745e8d8bef9SDimitry Andric   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7746fe013be4SDimitry Andric   Value *Count = getTripCount();
7747e8d8bef9SDimitry Andric   // Reuse existing vector loop preheader for TC checks.
7748e8d8bef9SDimitry Andric   // Note that new preheader block is generated for vector loop.
7749e8d8bef9SDimitry Andric   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7750e8d8bef9SDimitry Andric   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7751e8d8bef9SDimitry Andric 
7752e8d8bef9SDimitry Andric   // Generate code to check if the loop's trip count is less than VF * UF of the
7753e8d8bef9SDimitry Andric   // main vector loop.
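7753e8d8bef9SDimitry Andric   // When a scalar epilogue is required, the bypass must also be taken when
7753e8d8bef9SDimitry Andric   // the trip count is an exact multiple of the step, so that at least one
7753e8d8bef9SDimitry Andric   // iteration is left for the scalar loop; hence ULE rather than ULT below.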
7754fe013be4SDimitry Andric   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7755fe013be4SDimitry Andric                                                     : VF.isVector())
7756fe013be4SDimitry Andric                ? ICmpInst::ICMP_ULE
7757fe013be4SDimitry Andric                : ICmpInst::ICMP_ULT;
7758e8d8bef9SDimitry Andric 
7759e8d8bef9SDimitry Andric   Value *CheckMinIters = Builder.CreateICmp(
7760349cc55cSDimitry Andric       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7761e8d8bef9SDimitry Andric       "min.iters.check");
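7761e8d8bef9SDimitry Andric   // With e.g. VF = 4 and UF = 2 this materializes as (illustrative):
7761e8d8bef9SDimitry Andric   //   %min.iters.check = icmp ult i64 %count, 8
7761e8d8bef9SDimitry Andric   // (ule instead of ult when a scalar epilogue is required).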
7762e8d8bef9SDimitry Andric 
7763e8d8bef9SDimitry Andric   if (!ForEpilogue)
7764e8d8bef9SDimitry Andric     TCCheckBlock->setName("vector.main.loop.iter.check");
7765e8d8bef9SDimitry Andric 
7766e8d8bef9SDimitry Andric   // Create new preheader for vector loop.
7767e8d8bef9SDimitry Andric   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7768e8d8bef9SDimitry Andric                                    DT, LI, nullptr, "vector.ph");
7769e8d8bef9SDimitry Andric 
7770e8d8bef9SDimitry Andric   if (ForEpilogue) {
7771e8d8bef9SDimitry Andric     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7772e8d8bef9SDimitry Andric                                  DT->getNode(Bypass)->getIDom()) &&
7773e8d8bef9SDimitry Andric            "TC check is expected to dominate Bypass");
7774e8d8bef9SDimitry Andric 
7775e8d8bef9SDimitry Andric     // Update dominator for Bypass & LoopExit.
7776e8d8bef9SDimitry Andric     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7777fe013be4SDimitry Andric     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7778fe6060f1SDimitry Andric       // For loops with multiple exits, there's no edge from the middle block
7779fe6060f1SDimitry Andric       // to exit blocks (as the epilogue must run) and thus no need to update
7780fe6060f1SDimitry Andric       // the immediate dominator of the exit blocks.
7781e8d8bef9SDimitry Andric       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7782e8d8bef9SDimitry Andric 
7783e8d8bef9SDimitry Andric     LoopBypassBlocks.push_back(TCCheckBlock);
7784e8d8bef9SDimitry Andric 
7785e8d8bef9SDimitry Andric     // Save the trip count so we don't have to regenerate it in the
7786e8d8bef9SDimitry Andric     // vec.epilog.iter.check. This is safe to do because the trip count
7787e8d8bef9SDimitry Andric     // generated here dominates the vector epilog iter check.
7788e8d8bef9SDimitry Andric     EPI.TripCount = Count;
7789e8d8bef9SDimitry Andric   }
7790e8d8bef9SDimitry Andric 
7791c9157d92SDimitry Andric   BranchInst &BI =
7792c9157d92SDimitry Andric       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7793c9157d92SDimitry Andric   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7794c9157d92SDimitry Andric     setBranchWeights(BI, MinItersBypassWeights);
7795c9157d92SDimitry Andric   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7796e8d8bef9SDimitry Andric 
7797e8d8bef9SDimitry Andric   return TCCheckBlock;
7798e8d8bef9SDimitry Andric }
7799e8d8bef9SDimitry Andric 
7800e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7801e8d8bef9SDimitry Andric // EpilogueVectorizerEpilogueLoop
7802e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7803e8d8bef9SDimitry Andric 
7804e8d8bef9SDimitry Andric /// This function is partially responsible for generating the control flow
7805e8d8bef9SDimitry Andric /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
780604eeddc0SDimitry Andric std::pair<BasicBlock *, Value *>
7807fe013be4SDimitry Andric EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7808fe013be4SDimitry Andric     const SCEV2ValueTy &ExpandedSCEVs) {
780981ad6265SDimitry Andric   createVectorLoopSkeleton("vec.epilog.");
7810e8d8bef9SDimitry Andric 
7811e8d8bef9SDimitry Andric   // Now, compare the remaining count, and if there aren't enough iterations
7812e8d8bef9SDimitry Andric   // to execute the vectorized epilogue, skip to the scalar part.
7813e8d8bef9SDimitry Andric   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7814e8d8bef9SDimitry Andric   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7815e8d8bef9SDimitry Andric   LoopVectorPreHeader =
7816e8d8bef9SDimitry Andric       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7817e8d8bef9SDimitry Andric                  LI, nullptr, "vec.epilog.ph");
781881ad6265SDimitry Andric   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7819e8d8bef9SDimitry Andric                                           VecEpilogueIterationCountCheck);
7820e8d8bef9SDimitry Andric 
7821e8d8bef9SDimitry Andric   // Adjust the control flow taking the state info from the main loop
7822e8d8bef9SDimitry Andric   // vectorization into account.
7823e8d8bef9SDimitry Andric   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7824e8d8bef9SDimitry Andric          "expected this to be saved from the previous pass.");
7825e8d8bef9SDimitry Andric   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7826e8d8bef9SDimitry Andric       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7827e8d8bef9SDimitry Andric 
7828e8d8bef9SDimitry Andric   DT->changeImmediateDominator(LoopVectorPreHeader,
7829e8d8bef9SDimitry Andric                                EPI.MainLoopIterationCountCheck);
7830e8d8bef9SDimitry Andric 
7831e8d8bef9SDimitry Andric   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7832e8d8bef9SDimitry Andric       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7833e8d8bef9SDimitry Andric 
7834e8d8bef9SDimitry Andric   if (EPI.SCEVSafetyCheck)
7835e8d8bef9SDimitry Andric     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7836e8d8bef9SDimitry Andric         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7837e8d8bef9SDimitry Andric   if (EPI.MemSafetyCheck)
7838e8d8bef9SDimitry Andric     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7839e8d8bef9SDimitry Andric         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7840e8d8bef9SDimitry Andric 
7841e8d8bef9SDimitry Andric   DT->changeImmediateDominator(
7842e8d8bef9SDimitry Andric       VecEpilogueIterationCountCheck,
7843e8d8bef9SDimitry Andric       VecEpilogueIterationCountCheck->getSinglePredecessor());
7844e8d8bef9SDimitry Andric 
7845e8d8bef9SDimitry Andric   DT->changeImmediateDominator(LoopScalarPreHeader,
7846e8d8bef9SDimitry Andric                                EPI.EpilogueIterationCountCheck);
7847fe013be4SDimitry Andric   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7848fe6060f1SDimitry Andric     // If there is an epilogue which must run, there's no edge from the
7849fe6060f1SDimitry Andric     // middle block to exit blocks and thus no need to update the immediate
7850fe6060f1SDimitry Andric     // dominator of the exit blocks.
7851fe6060f1SDimitry Andric     DT->changeImmediateDominator(LoopExitBlock,
7852fe6060f1SDimitry Andric                                  EPI.EpilogueIterationCountCheck);
7853e8d8bef9SDimitry Andric 
7854bdd1243dSDimitry Andric   // Keep track of bypass blocks, as they feed start values to the induction and
7855bdd1243dSDimitry Andric   // reduction phis in the scalar loop preheader.
7856e8d8bef9SDimitry Andric   if (EPI.SCEVSafetyCheck)
7857e8d8bef9SDimitry Andric     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7858e8d8bef9SDimitry Andric   if (EPI.MemSafetyCheck)
7859e8d8bef9SDimitry Andric     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7860e8d8bef9SDimitry Andric   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7861e8d8bef9SDimitry Andric 
7862bdd1243dSDimitry Andric   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7863bdd1243dSDimitry Andric   // reductions which merge control-flow from the latch block and the middle
7864bdd1243dSDimitry Andric   // block. Update the incoming values here and move the Phi into the preheader.
786504eeddc0SDimitry Andric   SmallVector<PHINode *, 4> PhisInBlock;
786604eeddc0SDimitry Andric   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
786704eeddc0SDimitry Andric     PhisInBlock.push_back(&Phi);
786804eeddc0SDimitry Andric 
786904eeddc0SDimitry Andric   for (PHINode *Phi : PhisInBlock) {
7870bdd1243dSDimitry Andric     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
787104eeddc0SDimitry Andric     Phi->replaceIncomingBlockWith(
787204eeddc0SDimitry Andric         VecEpilogueIterationCountCheck->getSinglePredecessor(),
787304eeddc0SDimitry Andric         VecEpilogueIterationCountCheck);
7874bdd1243dSDimitry Andric 
7875bdd1243dSDimitry Andric     // If the phi doesn't have an incoming value from the
7876bdd1243dSDimitry Andric     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7877bdd1243dSDimitry Andric     // value and also those from other check blocks. This is needed for
7878bdd1243dSDimitry Andric     // reduction phis only.
7879bdd1243dSDimitry Andric     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7880bdd1243dSDimitry Andric           return EPI.EpilogueIterationCountCheck == IncB;
7881bdd1243dSDimitry Andric         }))
7882bdd1243dSDimitry Andric       continue;
788304eeddc0SDimitry Andric     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
788404eeddc0SDimitry Andric     if (EPI.SCEVSafetyCheck)
788504eeddc0SDimitry Andric       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
788604eeddc0SDimitry Andric     if (EPI.MemSafetyCheck)
788704eeddc0SDimitry Andric       Phi->removeIncomingValue(EPI.MemSafetyCheck);
788804eeddc0SDimitry Andric   }
788904eeddc0SDimitry Andric 
7890e8d8bef9SDimitry Andric   // Generate a resume induction for the vector epilogue and put it in the
7891e8d8bef9SDimitry Andric   // vector epilogue preheader
7892e8d8bef9SDimitry Andric   Type *IdxTy = Legal->getWidestInductionType();
7893c9157d92SDimitry Andric   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7894c9157d92SDimitry Andric   EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7895e8d8bef9SDimitry Andric   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7896e8d8bef9SDimitry Andric   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7897e8d8bef9SDimitry Andric                            EPI.MainLoopIterationCountCheck);
7898e8d8bef9SDimitry Andric 
7899e8d8bef9SDimitry Andric   // Generate induction resume values. These variables save the new starting
7900e8d8bef9SDimitry Andric   // indexes for the scalar loop. They are used to test if there are any tail
7901e8d8bef9SDimitry Andric   // iterations left once the vector loop has completed.
7902e8d8bef9SDimitry Andric   // Note that when the vectorized epilogue is skipped due to iteration count
7903e8d8bef9SDimitry Andric   // check, then the resume value for the induction variable comes from
7904e8d8bef9SDimitry Andric   // the trip count of the main vector loop, hence passing the AdditionalBypass
7905e8d8bef9SDimitry Andric   // argument.
7906fe013be4SDimitry Andric   createInductionResumeValues(ExpandedSCEVs,
7907fe013be4SDimitry Andric                               {VecEpilogueIterationCountCheck,
7908e8d8bef9SDimitry Andric                                EPI.VectorTripCount} /* AdditionalBypass */);
7909e8d8bef9SDimitry Andric 
7910bdd1243dSDimitry Andric   return {completeLoopSkeleton(), EPResumeVal};
7911e8d8bef9SDimitry Andric }
7912e8d8bef9SDimitry Andric 
7913e8d8bef9SDimitry Andric BasicBlock *
7914e8d8bef9SDimitry Andric EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
791581ad6265SDimitry Andric     BasicBlock *Bypass, BasicBlock *Insert) {
7916e8d8bef9SDimitry Andric 
7917e8d8bef9SDimitry Andric   assert(EPI.TripCount &&
7918e8d8bef9SDimitry Andric          "Expected trip count to have been saved in the first pass.");
7919e8d8bef9SDimitry Andric   assert(
7920e8d8bef9SDimitry Andric       (!isa<Instruction>(EPI.TripCount) ||
7921e8d8bef9SDimitry Andric        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7922e8d8bef9SDimitry Andric       "saved trip count does not dominate insertion point.");
7923e8d8bef9SDimitry Andric   Value *TC = EPI.TripCount;
7924e8d8bef9SDimitry Andric   IRBuilder<> Builder(Insert->getTerminator());
7925e8d8bef9SDimitry Andric   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7926e8d8bef9SDimitry Andric 
7927e8d8bef9SDimitry Andric   // Generate code to check if the loop's trip count is less than VF * UF of the
7928e8d8bef9SDimitry Andric   // vector epilogue loop.
7929fe013be4SDimitry Andric   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7930fe013be4SDimitry Andric                ? ICmpInst::ICMP_ULE
7931fe013be4SDimitry Andric                : ICmpInst::ICMP_ULT;
7932e8d8bef9SDimitry Andric 
7933349cc55cSDimitry Andric   Value *CheckMinIters =
7934349cc55cSDimitry Andric       Builder.CreateICmp(P, Count,
7935349cc55cSDimitry Andric                          createStepForVF(Builder, Count->getType(),
7936349cc55cSDimitry Andric                                          EPI.EpilogueVF, EPI.EpilogueUF),
7937e8d8bef9SDimitry Andric                          "min.epilog.iters.check");
7938e8d8bef9SDimitry Andric 
7939c9157d92SDimitry Andric   BranchInst &BI =
7940c9157d92SDimitry Andric       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7941c9157d92SDimitry Andric   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7942c9157d92SDimitry Andric     unsigned MainLoopStep = UF * VF.getKnownMinValue();
7943c9157d92SDimitry Andric     unsigned EpilogueLoopStep =
7944c9157d92SDimitry Andric         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7945c9157d92SDimitry Andric     // We assume the remaining `Count` is equally distributed in
7946c9157d92SDimitry Andric     // [0, MainLoopStep), so the probability of `Count < EpilogueLoopStep`
7947c9157d92SDimitry Andric     // is
7948c9157d92SDimitry Andric     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
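    // For example, with MainLoopStep = 8 and EpilogueLoopStep = 4, `Count` is
    // assumed uniform in [0, 8), so the epilogue is skipped with an estimated
    // probability of 4/8, giving branch weights {4, 4}.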
7949c9157d92SDimitry Andric     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7950c9157d92SDimitry Andric     const uint32_t Weights[] = {EstimatedSkipCount,
7951c9157d92SDimitry Andric                                 MainLoopStep - EstimatedSkipCount};
7952c9157d92SDimitry Andric     setBranchWeights(BI, Weights);
7953c9157d92SDimitry Andric   }
7954c9157d92SDimitry Andric   ReplaceInstWithInst(Insert->getTerminator(), &BI);
7955e8d8bef9SDimitry Andric 
7956e8d8bef9SDimitry Andric   LoopBypassBlocks.push_back(Insert);
7957e8d8bef9SDimitry Andric   return Insert;
7958e8d8bef9SDimitry Andric }
7959e8d8bef9SDimitry Andric 
7960e8d8bef9SDimitry Andric void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7961e8d8bef9SDimitry Andric   LLVM_DEBUG({
7962e8d8bef9SDimitry Andric     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7963349cc55cSDimitry Andric            << "Epilogue Loop VF:" << EPI.EpilogueVF
7964e8d8bef9SDimitry Andric            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7965e8d8bef9SDimitry Andric   });
7966e8d8bef9SDimitry Andric }
7967e8d8bef9SDimitry Andric 
7968e8d8bef9SDimitry Andric void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7969e8d8bef9SDimitry Andric   DEBUG_WITH_TYPE(VerboseDebug, {
79704824e7fdSDimitry Andric     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7971e8d8bef9SDimitry Andric   });
7972e8d8bef9SDimitry Andric }
7973e8d8bef9SDimitry Andric 
79740b57cec5SDimitry Andric bool LoopVectorizationPlanner::getDecisionAndClampRange(
7975e8d8bef9SDimitry Andric     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7976e8d8bef9SDimitry Andric   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
79770b57cec5SDimitry Andric   bool PredicateAtRangeStart = Predicate(Range.Start);
79780b57cec5SDimitry Andric 
7979fe013be4SDimitry Andric   for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
79800b57cec5SDimitry Andric     if (Predicate(TmpVF) != PredicateAtRangeStart) {
79810b57cec5SDimitry Andric       Range.End = TmpVF;
79820b57cec5SDimitry Andric       break;
79830b57cec5SDimitry Andric     }
79840b57cec5SDimitry Andric 
79850b57cec5SDimitry Andric   return PredicateAtRangeStart;
79860b57cec5SDimitry Andric }
79870b57cec5SDimitry Andric 
79880b57cec5SDimitry Andric /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
79890b57cec5SDimitry Andric /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
79900b57cec5SDimitry Andric /// of VF's starting at a given VF and extending it as much as possible. Each
79910b57cec5SDimitry Andric /// vectorization decision can potentially shorten this sub-range during
79920b57cec5SDimitry Andric /// buildVPlan().
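/// For example, with MinVF = 2 and MaxVF = 8, if the first vectorization
/// decision that changes its answer does so at VF = 8, this builds one VPlan
/// covering the sub-range {2, 4} and a second one covering {8}.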
7993e8d8bef9SDimitry Andric void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7994e8d8bef9SDimitry Andric                                            ElementCount MaxVF) {
7995fe013be4SDimitry Andric   auto MaxVFTimes2 = MaxVF * 2;
7996fe013be4SDimitry Andric   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7997fe013be4SDimitry Andric     VFRange SubRange = {VF, MaxVFTimes2};
79980b57cec5SDimitry Andric     VPlans.push_back(buildVPlan(SubRange));
79990b57cec5SDimitry Andric     VF = SubRange.End;
80000b57cec5SDimitry Andric   }
80010b57cec5SDimitry Andric }
80020b57cec5SDimitry Andric 
80030b57cec5SDimitry Andric VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8004fe013be4SDimitry Andric                                          VPlan &Plan) {
80050b57cec5SDimitry Andric   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
80060b57cec5SDimitry Andric 
80070b57cec5SDimitry Andric   // Look for cached value.
80080b57cec5SDimitry Andric   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
80090b57cec5SDimitry Andric   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
80100b57cec5SDimitry Andric   if (ECEntryIt != EdgeMaskCache.end())
80110b57cec5SDimitry Andric     return ECEntryIt->second;
80120b57cec5SDimitry Andric 
8013cdc20ff6SDimitry Andric   VPValue *SrcMask = getBlockInMask(Src);
80140b57cec5SDimitry Andric 
80150b57cec5SDimitry Andric   // The terminator has to be a branch inst!
80160b57cec5SDimitry Andric   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
80170b57cec5SDimitry Andric   assert(BI && "Unexpected terminator found");
80180b57cec5SDimitry Andric 
801913138422SDimitry Andric   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
80200b57cec5SDimitry Andric     return EdgeMaskCache[Edge] = SrcMask;
80210b57cec5SDimitry Andric 
8022e8d8bef9SDimitry Andric   // If source is an exiting block, we know the exit edge is dynamically dead
8023e8d8bef9SDimitry Andric   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8024e8d8bef9SDimitry Andric   // adding uses of an otherwise potentially dead instruction.
8025e8d8bef9SDimitry Andric   if (OrigLoop->isLoopExiting(Src))
8026e8d8bef9SDimitry Andric     return EdgeMaskCache[Edge] = SrcMask;
8027e8d8bef9SDimitry Andric 
8028fe013be4SDimitry Andric   VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
80290b57cec5SDimitry Andric   assert(EdgeMask && "No Edge Mask found for condition");
80300b57cec5SDimitry Andric 
80310b57cec5SDimitry Andric   if (BI->getSuccessor(0) != Dst)
80320eae32dcSDimitry Andric     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
80330b57cec5SDimitry Andric 
8034d409305fSDimitry Andric   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8035d409305fSDimitry Andric     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8036d409305fSDimitry Andric     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8037d409305fSDimitry Andric     // The select version does not introduce new UB if SrcMask is false and
8038d409305fSDimitry Andric     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
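    // E.g., instead of emitting:
    //   %mask = and i1 %SrcMask, %EdgeMask
    // we emit:
    //   %mask = select i1 %SrcMask, i1 %EdgeMask, i1 false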
8039fe013be4SDimitry Andric     VPValue *False = Plan.getVPValueOrAddLiveIn(
8040d409305fSDimitry Andric         ConstantInt::getFalse(BI->getCondition()->getType()));
80410eae32dcSDimitry Andric     EdgeMask =
80420eae32dcSDimitry Andric         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8043d409305fSDimitry Andric   }
80440b57cec5SDimitry Andric 
80450b57cec5SDimitry Andric   return EdgeMaskCache[Edge] = EdgeMask;
80460b57cec5SDimitry Andric }
80470b57cec5SDimitry Andric 
8048c9157d92SDimitry Andric void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
8049c9157d92SDimitry Andric   BasicBlock *Header = OrigLoop->getHeader();
80500b57cec5SDimitry Andric 
8051c9157d92SDimitry Andric   // When not folding the tail, use nullptr to model an all-true mask.
8052c9157d92SDimitry Andric   if (!CM.foldTailByMasking()) {
8053c9157d92SDimitry Andric     BlockMaskCache[Header] = nullptr;
8054c9157d92SDimitry Andric     return;
8055c9157d92SDimitry Andric   }
8056753f127fSDimitry Andric 
80570eae32dcSDimitry Andric   // Introduce the early-exit compare IV <= BTC to form header block mask.
805804eeddc0SDimitry Andric   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
805904eeddc0SDimitry Andric   // constructing the desired canonical IV in the header block as its first
806004eeddc0SDimitry Andric   // non-phi instructions.
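  // For a loop with trip count TC, BTC = TC - 1, and the mask compares each
  // lane of the widened canonical IV against a splat of BTC, roughly:
  //   %mask = icmp ule <VF x iN> %widened.canonical.iv, %btc.splat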
8061753f127fSDimitry Andric 
8062fe013be4SDimitry Andric   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
806304eeddc0SDimitry Andric   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8064fe013be4SDimitry Andric   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8065c9157d92SDimitry Andric   HeaderVPBB->insert(IV, NewInsertionPoint);
80660eae32dcSDimitry Andric 
8067e8d8bef9SDimitry Andric   VPBuilder::InsertPointGuard Guard(Builder);
806804eeddc0SDimitry Andric   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8069c9157d92SDimitry Andric   VPValue *BlockMask = nullptr;
8070fe013be4SDimitry Andric   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8071c9157d92SDimitry Andric   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8072c9157d92SDimitry Andric   BlockMaskCache[Header] = BlockMask;
80730b57cec5SDimitry Andric }
80740b57cec5SDimitry Andric 
8075cdc20ff6SDimitry Andric VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8076cdc20ff6SDimitry Andric   // Return the cached value.
8077cdc20ff6SDimitry Andric   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8078cdc20ff6SDimitry Andric   assert(BCEntryIt != BlockMaskCache.end() &&
8079cdc20ff6SDimitry Andric          "Trying to access mask for block without one.");
8080c9157d92SDimitry Andric   return BCEntryIt->second;
8081cdc20ff6SDimitry Andric }
8082c9157d92SDimitry Andric 
8083cdc20ff6SDimitry Andric void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8084cdc20ff6SDimitry Andric   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8085cdc20ff6SDimitry Andric   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8086c9157d92SDimitry Andric   assert(OrigLoop->getHeader() != BB &&
8087c9157d92SDimitry Andric          "Loop header must have cached block mask");
8088c9157d92SDimitry Andric 
8089c9157d92SDimitry Andric   // All-one mask is modelled as no-mask following the convention for masked
8090c9157d92SDimitry Andric   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8091c9157d92SDimitry Andric   VPValue *BlockMask = nullptr;
80920b57cec5SDimitry Andric   // This is the block mask. We OR all incoming edges.
80930b57cec5SDimitry Andric   for (auto *Predecessor : predecessors(BB)) {
80940b57cec5SDimitry Andric     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8095cdc20ff6SDimitry Andric     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8096cdc20ff6SDimitry Andric       BlockMaskCache[BB] = EdgeMask;
80976c20abcdSDimitry Andric       return;
8098cdc20ff6SDimitry Andric     }
80990b57cec5SDimitry Andric 
81000b57cec5SDimitry Andric     if (!BlockMask) { // BlockMask still has its initial nullptr value.
81010b57cec5SDimitry Andric       BlockMask = EdgeMask;
81020b57cec5SDimitry Andric       continue;
81030b57cec5SDimitry Andric     }
81040b57cec5SDimitry Andric 
81050eae32dcSDimitry Andric     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
81060b57cec5SDimitry Andric   }
81070b57cec5SDimitry Andric 
8108cdc20ff6SDimitry Andric   BlockMaskCache[BB] = BlockMask;
81090b57cec5SDimitry Andric }
81100b57cec5SDimitry Andric 
8111fe6060f1SDimitry Andric VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8112fe6060f1SDimitry Andric                                                 ArrayRef<VPValue *> Operands,
8113fe6060f1SDimitry Andric                                                 VFRange &Range,
81140b57cec5SDimitry Andric                                                 VPlanPtr &Plan) {
81155ffd83dbSDimitry Andric   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
81165ffd83dbSDimitry Andric          "Must be called with either a load or store");
81170b57cec5SDimitry Andric 
8118e8d8bef9SDimitry Andric   auto willWiden = [&](ElementCount VF) -> bool {
81190b57cec5SDimitry Andric     LoopVectorizationCostModel::InstWidening Decision =
81200b57cec5SDimitry Andric         CM.getWideningDecision(I, VF);
81210b57cec5SDimitry Andric     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
81220b57cec5SDimitry Andric            "CM decision should be taken at this point.");
8123480093f4SDimitry Andric     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8124480093f4SDimitry Andric       return true;
8125480093f4SDimitry Andric     if (CM.isScalarAfterVectorization(I, VF) ||
8126480093f4SDimitry Andric         CM.isProfitableToScalarize(I, VF))
8127480093f4SDimitry Andric       return false;
81280b57cec5SDimitry Andric     return Decision != LoopVectorizationCostModel::CM_Scalarize;
81290b57cec5SDimitry Andric   };
81300b57cec5SDimitry Andric 
81310b57cec5SDimitry Andric   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
81320b57cec5SDimitry Andric     return nullptr;
81330b57cec5SDimitry Andric 
81340b57cec5SDimitry Andric   VPValue *Mask = nullptr;
81350b57cec5SDimitry Andric   if (Legal->isMaskRequired(I))
8136cdc20ff6SDimitry Andric     Mask = getBlockInMask(I->getParent());
81370b57cec5SDimitry Andric 
8138349cc55cSDimitry Andric   // Determine if the pointer operand of the access is either consecutive or
8139349cc55cSDimitry Andric   // reverse consecutive.
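  // E.g., loads of A[i] are consecutive, loads of A[N - i] are reverse
  // consecutive, and strided accesses such as A[2 * i] are neither; the
  // latter are handled elsewhere (gather/scatter, interleaving, or
  // scalarization).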
8140349cc55cSDimitry Andric   LoopVectorizationCostModel::InstWidening Decision =
8141349cc55cSDimitry Andric       CM.getWideningDecision(I, Range.Start);
8142349cc55cSDimitry Andric   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8143349cc55cSDimitry Andric   bool Consecutive =
8144349cc55cSDimitry Andric       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8145349cc55cSDimitry Andric 
8146de8261c4SDimitry Andric   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8147de8261c4SDimitry Andric   if (Consecutive) {
8148cdc20ff6SDimitry Andric     auto *GEP = dyn_cast<GetElementPtrInst>(
8149cdc20ff6SDimitry Andric         Ptr->getUnderlyingValue()->stripPointerCasts());
8150cdc20ff6SDimitry Andric     auto *VectorPtr = new VPVectorPointerRecipe(
8151cdc20ff6SDimitry Andric         Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8152cdc20ff6SDimitry Andric         I->getDebugLoc());
8153de8261c4SDimitry Andric     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8154de8261c4SDimitry Andric     Ptr = VectorPtr;
8155de8261c4SDimitry Andric   }
81565ffd83dbSDimitry Andric   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8157de8261c4SDimitry Andric     return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8158de8261c4SDimitry Andric                                               Reverse);
81595ffd83dbSDimitry Andric 
81605ffd83dbSDimitry Andric   StoreInst *Store = cast<StoreInst>(I);
8161de8261c4SDimitry Andric   return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8162de8261c4SDimitry Andric                                             Consecutive, Reverse);
81630b57cec5SDimitry Andric }
81640b57cec5SDimitry Andric 
816581ad6265SDimitry Andric /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
816681ad6265SDimitry Andric /// insert a recipe to expand the step for the induction recipe.
8167fe013be4SDimitry Andric static VPWidenIntOrFpInductionRecipe *
8168fe013be4SDimitry Andric createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8169fe013be4SDimitry Andric                             VPValue *Start, const InductionDescriptor &IndDesc,
8170fe013be4SDimitry Andric                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8171fe013be4SDimitry Andric                             VFRange &Range) {
81721fd87a68SDimitry Andric   assert(IndDesc.getStartValue() ==
81731fd87a68SDimitry Andric          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
817481ad6265SDimitry Andric   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
817581ad6265SDimitry Andric          "step must be loop invariant");
817681ad6265SDimitry Andric 
817781ad6265SDimitry Andric   VPValue *Step =
817881ad6265SDimitry Andric       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
81791fd87a68SDimitry Andric   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8180fe013be4SDimitry Andric     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
81811fd87a68SDimitry Andric   }
81821fd87a68SDimitry Andric   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8183fe013be4SDimitry Andric   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
81841fd87a68SDimitry Andric }
81851fd87a68SDimitry Andric 
818681ad6265SDimitry Andric VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
818781ad6265SDimitry Andric     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
81881fd87a68SDimitry Andric 
81890b57cec5SDimitry Andric   // Check if this is an integer or fp induction. If so, build the recipe that
81900b57cec5SDimitry Andric   // produces its scalar and vector values.
81911fd87a68SDimitry Andric   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8192fe013be4SDimitry Andric     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
819381ad6265SDimitry Andric                                        *PSE.getSE(), *OrigLoop, Range);
81940b57cec5SDimitry Andric 
819581ad6265SDimitry Andric   // Check if this is pointer induction. If so, build the recipe for it.
81966246ae0bSDimitry Andric   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8197bdd1243dSDimitry Andric     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8198bdd1243dSDimitry Andric                                                            *PSE.getSE());
81996246ae0bSDimitry Andric     return new VPWidenPointerInductionRecipe(
8200bdd1243dSDimitry Andric         Phi, Operands[0], Step, *II,
82016246ae0bSDimitry Andric         LoopVectorizationPlanner::getDecisionAndClampRange(
82026246ae0bSDimitry Andric             [&](ElementCount VF) {
82036246ae0bSDimitry Andric               return CM.isScalarAfterVectorization(Phi, VF);
82046246ae0bSDimitry Andric             },
82056246ae0bSDimitry Andric             Range));
82066246ae0bSDimitry Andric   }
82070b57cec5SDimitry Andric   return nullptr;
82080b57cec5SDimitry Andric }
82090b57cec5SDimitry Andric 
8210fe6060f1SDimitry Andric VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
821181ad6265SDimitry Andric     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
82120b57cec5SDimitry Andric   // Optimize the special case where the source is a constant integer
82130b57cec5SDimitry Andric   // induction variable. Notice that we can only optimize the 'trunc' case
82140b57cec5SDimitry Andric   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
82150b57cec5SDimitry Andric   // (c) other casts depend on pointer size.
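  // E.g., for:
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %t = trunc i64 %iv to i32
  // it is cheaper to widen the induction directly at the narrow i32 type
  // than to widen %iv and truncate every element.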
82160b57cec5SDimitry Andric 
82170b57cec5SDimitry Andric   // Determine whether \p K is a truncation based on an induction variable that
82180b57cec5SDimitry Andric   // can be optimized.
82190b57cec5SDimitry Andric   auto isOptimizableIVTruncate =
8220e8d8bef9SDimitry Andric       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8221e8d8bef9SDimitry Andric     return [=](ElementCount VF) -> bool {
8222e8d8bef9SDimitry Andric       return CM.isOptimizableIVTruncate(K, VF);
8223e8d8bef9SDimitry Andric     };
82240b57cec5SDimitry Andric   };
82250b57cec5SDimitry Andric 
82265ffd83dbSDimitry Andric   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8227e8d8bef9SDimitry Andric           isOptimizableIVTruncate(I), Range)) {
8228e8d8bef9SDimitry Andric 
82290eae32dcSDimitry Andric     auto *Phi = cast<PHINode>(I->getOperand(0));
82300eae32dcSDimitry Andric     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8231fe013be4SDimitry Andric     VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8232fe013be4SDimitry Andric     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8233fe013be4SDimitry Andric                                        *OrigLoop, Range);
8234e8d8bef9SDimitry Andric   }
82350b57cec5SDimitry Andric   return nullptr;
82360b57cec5SDimitry Andric }
82370b57cec5SDimitry Andric 
8238fe6060f1SDimitry Andric VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8239fe6060f1SDimitry Andric                                                 ArrayRef<VPValue *> Operands,
8240fe6060f1SDimitry Andric                                                 VPlanPtr &Plan) {
8241fe6060f1SDimitry Andric   // If all incoming values are equal, the incoming VPValue can be used directly
8242fe6060f1SDimitry Andric   // instead of creating a new VPBlendRecipe.
8243bdd1243dSDimitry Andric   if (llvm::all_equal(Operands))
8244fe6060f1SDimitry Andric     return Operands[0];
8245fe6060f1SDimitry Andric 
824681ad6265SDimitry Andric   unsigned NumIncoming = Phi->getNumIncomingValues();
824781ad6265SDimitry Andric   // For in-loop reductions, we do not need to create an additional select.
824881ad6265SDimitry Andric   VPValue *InLoopVal = nullptr;
824981ad6265SDimitry Andric   for (unsigned In = 0; In < NumIncoming; In++) {
825081ad6265SDimitry Andric     PHINode *PhiOp =
825181ad6265SDimitry Andric         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
825281ad6265SDimitry Andric     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
825381ad6265SDimitry Andric       assert(!InLoopVal && "Found more than one in-loop reduction!");
825481ad6265SDimitry Andric       InLoopVal = Operands[In];
825581ad6265SDimitry Andric     }
825681ad6265SDimitry Andric   }
825781ad6265SDimitry Andric 
825881ad6265SDimitry Andric   assert((!InLoopVal || NumIncoming == 2) &&
825981ad6265SDimitry Andric          "Found an in-loop reduction for PHI with unexpected number of "
826081ad6265SDimitry Andric          "incoming values");
826181ad6265SDimitry Andric   if (InLoopVal)
826281ad6265SDimitry Andric     return Operands[Operands[0] == InLoopVal ? 1 : 0];
826381ad6265SDimitry Andric 
82640b57cec5SDimitry Andric   // We know that all PHIs in non-header blocks are converted into selects, so
82650b57cec5SDimitry Andric   // we don't have to worry about the insertion order and we can just use the
82660b57cec5SDimitry Andric   // builder. At this point we generate the predication tree. There may be
82670b57cec5SDimitry Andric   // duplications since this is a simple recursive scan, but future
82680b57cec5SDimitry Andric   // optimizations will clean it up.
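  // E.g., a phi merging %a (from BB1) and %b (from BB2) becomes a
  // VPBlendRecipe with operands {%a, edge-mask(BB1), %b, edge-mask(BB2)},
  // which is later lowered to vector selects.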
8269fe6060f1SDimitry Andric   SmallVector<VPValue *, 2> OperandsWithMask;
8270fe6060f1SDimitry Andric 
82710b57cec5SDimitry Andric   for (unsigned In = 0; In < NumIncoming; In++) {
82720b57cec5SDimitry Andric     VPValue *EdgeMask =
8273fe013be4SDimitry Andric         createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
82740b57cec5SDimitry Andric     assert((EdgeMask || NumIncoming == 1) &&
82750b57cec5SDimitry Andric            "Multiple predecessors with one having a full mask");
8276fe6060f1SDimitry Andric     OperandsWithMask.push_back(Operands[In]);
82770b57cec5SDimitry Andric     if (EdgeMask)
8278fe6060f1SDimitry Andric       OperandsWithMask.push_back(EdgeMask);
82790b57cec5SDimitry Andric   }
8280fe6060f1SDimitry Andric   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
82810b57cec5SDimitry Andric }
82820b57cec5SDimitry Andric 
8283fe6060f1SDimitry Andric VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8284fe6060f1SDimitry Andric                                                    ArrayRef<VPValue *> Operands,
8285fe013be4SDimitry Andric                                                    VFRange &Range,
8286fe013be4SDimitry Andric                                                    VPlanPtr &Plan) {
82870b57cec5SDimitry Andric   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
828804eeddc0SDimitry Andric       [this, CI](ElementCount VF) {
828904eeddc0SDimitry Andric         return CM.isScalarWithPredication(CI, VF);
829004eeddc0SDimitry Andric       },
82915ffd83dbSDimitry Andric       Range);
82920b57cec5SDimitry Andric 
82930b57cec5SDimitry Andric   if (IsPredicated)
82945ffd83dbSDimitry Andric     return nullptr;
82950b57cec5SDimitry Andric 
82965ffd83dbSDimitry Andric   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
82975ffd83dbSDimitry Andric   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8298e8d8bef9SDimitry Andric              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8299e8d8bef9SDimitry Andric              ID == Intrinsic::pseudoprobe ||
8300e8d8bef9SDimitry Andric              ID == Intrinsic::experimental_noalias_scope_decl))
83015ffd83dbSDimitry Andric     return nullptr;
83025ffd83dbSDimitry Andric 
8303fe013be4SDimitry Andric   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8304bdd1243dSDimitry Andric 
8305bdd1243dSDimitry Andric   // Is it beneficial to perform an intrinsic call rather than a lib call?
8306bdd1243dSDimitry Andric   bool ShouldUseVectorIntrinsic =
8307bdd1243dSDimitry Andric       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8308bdd1243dSDimitry Andric                 [&](ElementCount VF) -> bool {
8309c9157d92SDimitry Andric                   return CM.getCallWideningDecision(CI, VF).Kind ==
8310c9157d92SDimitry Andric                          LoopVectorizationCostModel::CM_IntrinsicCall;
8311bdd1243dSDimitry Andric                 },
8312bdd1243dSDimitry Andric                 Range);
8313bdd1243dSDimitry Andric   if (ShouldUseVectorIntrinsic)
8314a58f00eaSDimitry Andric     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8315a58f00eaSDimitry Andric                                  CI->getDebugLoc());
8316bdd1243dSDimitry Andric 
8317fe013be4SDimitry Andric   Function *Variant = nullptr;
8318c9157d92SDimitry Andric   std::optional<unsigned> MaskPos;
8319bdd1243dSDimitry Andric   // Is it better to call a vectorized version of the function than to
8320bdd1243dSDimitry Andric   // scalarize the call?
8321bdd1243dSDimitry Andric   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8322bdd1243dSDimitry Andric       [&](ElementCount VF) -> bool {
8323bdd1243dSDimitry Andric         // The following case may be scalarized depending on the VF.
8324bdd1243dSDimitry Andric         // The flag shows whether a plain vector call can be used for the
8325bdd1243dSDimitry Andric         // widened version of the instruction.
8326fe013be4SDimitry Andric 
8327fe013be4SDimitry Andric         // If we've found a variant at a previous VF, then stop looking. A
8328fe013be4SDimitry Andric         // vectorized variant of a function expects input in a certain shape
8329fe013be4SDimitry Andric         // -- basically the number of input registers, the number of lanes
8330fe013be4SDimitry Andric         // per register, and whether there's a mask required.
8331fe013be4SDimitry Andric         // We store a pointer to the variant in the VPWidenCallRecipe, so
8332fe013be4SDimitry Andric         // once we have an appropriate variant it's only valid for that VF.
8333fe013be4SDimitry Andric         // This will force a different vplan to be generated for each VF that
8334fe013be4SDimitry Andric         // finds a valid variant.
8335fe013be4SDimitry Andric         if (Variant)
8336fe013be4SDimitry Andric           return false;
8337c9157d92SDimitry Andric         LoopVectorizationCostModel::CallWideningDecision Decision =
8338c9157d92SDimitry Andric             CM.getCallWideningDecision(CI, VF);
8339c9157d92SDimitry Andric         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8340c9157d92SDimitry Andric           Variant = Decision.Variant;
8341c9157d92SDimitry Andric           MaskPos = Decision.MaskPos;
8342c9157d92SDimitry Andric           return true;
8343c9157d92SDimitry Andric         }
8344c9157d92SDimitry Andric 
8345c9157d92SDimitry Andric         return false;
8346bdd1243dSDimitry Andric       },
8347bdd1243dSDimitry Andric       Range);
8348fe013be4SDimitry Andric   if (ShouldUseVectorCall) {
8349c9157d92SDimitry Andric     if (MaskPos.has_value()) {
8350fe013be4SDimitry Andric       // We have 2 cases that would require a mask:
8351fe013be4SDimitry Andric       //   1) The block needs to be predicated, either due to a conditional
8352fe013be4SDimitry Andric       //      in the scalar loop or use of an active lane mask with
8353fe013be4SDimitry Andric       //      tail-folding, and we use the appropriate mask for the block.
8354fe013be4SDimitry Andric       //   2) No mask is required for the block, but the only available
8355fe013be4SDimitry Andric       //      vector variant at this VF requires a mask, so we synthesize an
8356fe013be4SDimitry Andric       //      all-true mask.
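      // E.g., if for the chosen VF the only available variant of `foo` is a
      // masked one (say `vfoo_masked`, name illustrative), an unpredicated
      // call to `foo` still gets that variant, with an all-true mask inserted
      // at the variant's expected mask position.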
8357fe013be4SDimitry Andric       VPValue *Mask = nullptr;
8358fe013be4SDimitry Andric       if (Legal->isMaskRequired(CI))
8359cdc20ff6SDimitry Andric         Mask = getBlockInMask(CI->getParent());
8360fe013be4SDimitry Andric       else
8361fe013be4SDimitry Andric         Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8362fe013be4SDimitry Andric             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8363fe013be4SDimitry Andric 
8364c9157d92SDimitry Andric       Ops.insert(Ops.begin() + *MaskPos, Mask);
8365fe013be4SDimitry Andric     }
8366fe013be4SDimitry Andric 
8367bdd1243dSDimitry Andric     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8368a58f00eaSDimitry Andric                                  Intrinsic::not_intrinsic, CI->getDebugLoc(),
8369a58f00eaSDimitry Andric                                  Variant);
8370fe013be4SDimitry Andric   }
8371bdd1243dSDimitry Andric 
8372bdd1243dSDimitry Andric   return nullptr;
83735ffd83dbSDimitry Andric }
83745ffd83dbSDimitry Andric 
83755ffd83dbSDimitry Andric bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
83765ffd83dbSDimitry Andric   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
83775ffd83dbSDimitry Andric          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
83785ffd83dbSDimitry Andric   // The instruction should be widened, unless it is scalar after
83795ffd83dbSDimitry Andric   // vectorization, scalarization is profitable, or it is predicated.
8380e8d8bef9SDimitry Andric   auto WillScalarize = [this, I](ElementCount VF) -> bool {
83815ffd83dbSDimitry Andric     return CM.isScalarAfterVectorization(I, VF) ||
838204eeddc0SDimitry Andric            CM.isProfitableToScalarize(I, VF) ||
838304eeddc0SDimitry Andric            CM.isScalarWithPredication(I, VF);
83845ffd83dbSDimitry Andric   };
83855ffd83dbSDimitry Andric   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
83865ffd83dbSDimitry Andric                                                              Range);
83875ffd83dbSDimitry Andric }
83885ffd83dbSDimitry Andric 
8389bdd1243dSDimitry Andric VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8390bdd1243dSDimitry Andric                                           ArrayRef<VPValue *> Operands,
8391bdd1243dSDimitry Andric                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8392bdd1243dSDimitry Andric   switch (I->getOpcode()) {
8393bdd1243dSDimitry Andric   default:
8394bdd1243dSDimitry Andric     return nullptr;
8395bdd1243dSDimitry Andric   case Instruction::SDiv:
8396bdd1243dSDimitry Andric   case Instruction::UDiv:
8397bdd1243dSDimitry Andric   case Instruction::SRem:
8398bdd1243dSDimitry Andric   case Instruction::URem: {
8399bdd1243dSDimitry Andric     // If not provably safe, use a select to form a safe divisor before widening the
8400bdd1243dSDimitry Andric     // div/rem operation itself.  Otherwise fall through to general handling below.
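    // E.g., a udiv that executes under mask %m is widened roughly as:
    //   %safe.rhs = select i1 %m, iN %rhs, iN 1
    //   %div = udiv iN %lhs, %safe.rhs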
8401bdd1243dSDimitry Andric     if (CM.isPredicatedInst(I)) {
8402bdd1243dSDimitry Andric       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8403cdc20ff6SDimitry Andric       VPValue *Mask = getBlockInMask(I->getParent());
8404fe013be4SDimitry Andric       VPValue *One = Plan->getVPValueOrAddLiveIn(
8405fe013be4SDimitry Andric           ConstantInt::get(I->getType(), 1u, false));
8406bdd1243dSDimitry Andric       auto *SafeRHS =
8407bdd1243dSDimitry Andric          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8408bdd1243dSDimitry Andric                            I->getDebugLoc());
8409bdd1243dSDimitry Andric       VPBB->appendRecipe(SafeRHS);
8410bdd1243dSDimitry Andric       Ops[1] = SafeRHS;
8411bdd1243dSDimitry Andric       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8412bdd1243dSDimitry Andric     }
8413fe013be4SDimitry Andric     [[fallthrough]];
8414bdd1243dSDimitry Andric   }
84150b57cec5SDimitry Andric   case Instruction::Add:
84160b57cec5SDimitry Andric   case Instruction::And:
84170b57cec5SDimitry Andric   case Instruction::AShr:
84180b57cec5SDimitry Andric   case Instruction::FAdd:
84190b57cec5SDimitry Andric   case Instruction::FCmp:
84200b57cec5SDimitry Andric   case Instruction::FDiv:
84210b57cec5SDimitry Andric   case Instruction::FMul:
84220b57cec5SDimitry Andric   case Instruction::FNeg:
84230b57cec5SDimitry Andric   case Instruction::FRem:
84240b57cec5SDimitry Andric   case Instruction::FSub:
84250b57cec5SDimitry Andric   case Instruction::ICmp:
84260b57cec5SDimitry Andric   case Instruction::LShr:
84270b57cec5SDimitry Andric   case Instruction::Mul:
84280b57cec5SDimitry Andric   case Instruction::Or:
84290b57cec5SDimitry Andric   case Instruction::Select:
84300b57cec5SDimitry Andric   case Instruction::Shl:
84310b57cec5SDimitry Andric   case Instruction::Sub:
84320b57cec5SDimitry Andric   case Instruction::Xor:
843381ad6265SDimitry Andric   case Instruction::Freeze:
8434fe6060f1SDimitry Andric     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8435bdd1243dSDimitry Andric   };
8436fe6060f1SDimitry Andric }
8437fe6060f1SDimitry Andric 
8438fe6060f1SDimitry Andric void VPRecipeBuilder::fixHeaderPhis() {
8439fe6060f1SDimitry Andric   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
844004eeddc0SDimitry Andric   for (VPHeaderPHIRecipe *R : PhisToFix) {
8441fe6060f1SDimitry Andric     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8442fe6060f1SDimitry Andric     VPRecipeBase *IncR =
8443fe6060f1SDimitry Andric         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8444fe6060f1SDimitry Andric     R->addOperand(IncR->getVPSingleValue());
8445fe6060f1SDimitry Andric   }
84460b57cec5SDimitry Andric }
84470b57cec5SDimitry Andric 
8448fe013be4SDimitry Andric VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8449fe013be4SDimitry Andric                                                        VFRange &Range,
8450fe013be4SDimitry Andric                                                        VPlan &Plan) {
84510b57cec5SDimitry Andric   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8452e8d8bef9SDimitry Andric       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
84530b57cec5SDimitry Andric       Range);
84540b57cec5SDimitry Andric 
8455bdd1243dSDimitry Andric   bool IsPredicated = CM.isPredicatedInst(I);
84560b57cec5SDimitry Andric 
84576e75b2fbSDimitry Andric   // Even if the instruction is not marked as uniform, there are certain
84586e75b2fbSDimitry Andric   // intrinsic calls that can be effectively treated as such, so we check for
84596e75b2fbSDimitry Andric   // them here. Conservatively, we only do this for scalable vectors, since
84606e75b2fbSDimitry Andric   // for fixed-width VFs we can always fall back on full scalarization.
84616e75b2fbSDimitry Andric   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
84626e75b2fbSDimitry Andric     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
84636e75b2fbSDimitry Andric     case Intrinsic::assume:
84646e75b2fbSDimitry Andric     case Intrinsic::lifetime_start:
84656e75b2fbSDimitry Andric     case Intrinsic::lifetime_end:
84666e75b2fbSDimitry Andric       // For scalable vectors if one of the operands is variant then we still
84676e75b2fbSDimitry Andric       // want to mark as uniform, which will generate one instruction for just
84686e75b2fbSDimitry Andric       // the first lane of the vector. We can't scalarize the call in the same
84696e75b2fbSDimitry Andric       // way as for fixed-width vectors because we don't know how many lanes
84706e75b2fbSDimitry Andric       // there are.
84716e75b2fbSDimitry Andric       //
84726e75b2fbSDimitry Andric       // The reasons for doing it this way for scalable vectors are:
84736e75b2fbSDimitry Andric       //   1. For the assume intrinsic, generating the instruction for the
84746e75b2fbSDimitry Andric       //      first lane is still better than not generating any at all. For
84756e75b2fbSDimitry Andric       //      example, the input may be a splat across all lanes.
84766e75b2fbSDimitry Andric       //   2. For the lifetime start/end intrinsics the pointer operand only
84776e75b2fbSDimitry Andric       //      does anything useful when the input comes from a stack object,
84786e75b2fbSDimitry Andric       //      which suggests it should always be uniform. For non-stack objects
84796e75b2fbSDimitry Andric       //      the effect is to poison the object, which still allows us to
84806e75b2fbSDimitry Andric       //      remove the call.
84816e75b2fbSDimitry Andric       IsUniform = true;
84826e75b2fbSDimitry Andric       break;
84836e75b2fbSDimitry Andric     default:
84846e75b2fbSDimitry Andric       break;
84856e75b2fbSDimitry Andric     }
84866e75b2fbSDimitry Andric   }
8487fe013be4SDimitry Andric   VPValue *BlockInMask = nullptr;
84880b57cec5SDimitry Andric   if (!IsPredicated) {
8489fe013be4SDimitry Andric     // First, handle the case where Instr is not predicated.
84900b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8491fcaf7f86SDimitry Andric   } else {
8492fe013be4SDimitry Andric     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8493fe013be4SDimitry Andric     // Instructions marked for predication are replicated and a mask operand is
8494fe013be4SDimitry Andric     // added initially. Masked replicate recipes will later be placed under an
8495fe013be4SDimitry Andric     // if-then construct to prevent side-effects. Generate recipes to compute
8496fe013be4SDimitry Andric     // the block mask for this region.
8497cdc20ff6SDimitry Andric     BlockInMask = getBlockInMask(I->getParent());
8498fe6060f1SDimitry Andric   }
8499fcaf7f86SDimitry Andric 
8500fe013be4SDimitry Andric   auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8501fe013be4SDimitry Andric                                        IsUniform, BlockInMask);
8502fe013be4SDimitry Andric   return toVPRecipeResult(Recipe);
85030b57cec5SDimitry Andric }
85040b57cec5SDimitry Andric 
8505fe6060f1SDimitry Andric VPRecipeOrVPValueTy
8506fe6060f1SDimitry Andric VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8507fe6060f1SDimitry Andric                                         ArrayRef<VPValue *> Operands,
8508bdd1243dSDimitry Andric                                         VFRange &Range, VPBasicBlock *VPBB,
8509bdd1243dSDimitry Andric                                         VPlanPtr &Plan) {
851081ad6265SDimitry Andric   // First, check for specific widening recipes that deal with inductions, Phi
851181ad6265SDimitry Andric   // nodes, calls and memory operations.
85125ffd83dbSDimitry Andric   VPRecipeBase *Recipe;
85135ffd83dbSDimitry Andric   if (auto Phi = dyn_cast<PHINode>(Instr)) {
85145ffd83dbSDimitry Andric     if (Phi->getParent() != OrigLoop->getHeader())
8515fe6060f1SDimitry Andric       return tryToBlend(Phi, Operands, Plan);
8516bdd1243dSDimitry Andric 
8517bdd1243dSDimitry Andric     // Always record recipes for header phis. Later first-order recurrence phis
8518bdd1243dSDimitry Andric     // can have earlier phis as incoming values.
8519bdd1243dSDimitry Andric     recordRecipeOf(Phi);
8520bdd1243dSDimitry Andric 
852181ad6265SDimitry Andric     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8522fe6060f1SDimitry Andric       return toVPRecipeResult(Recipe);
8523e8d8bef9SDimitry Andric 
852404eeddc0SDimitry Andric     VPHeaderPHIRecipe *PhiRecipe = nullptr;
852581ad6265SDimitry Andric     assert((Legal->isReductionVariable(Phi) ||
8526bdd1243dSDimitry Andric             Legal->isFixedOrderRecurrence(Phi)) &&
8527bdd1243dSDimitry Andric            "can only widen reductions and fixed-order recurrences here");
8528fe6060f1SDimitry Andric     VPValue *StartV = Operands[0];
8529e8d8bef9SDimitry Andric     if (Legal->isReductionVariable(Phi)) {
85300eae32dcSDimitry Andric       const RecurrenceDescriptor &RdxDesc =
85310eae32dcSDimitry Andric           Legal->getReductionVars().find(Phi)->second;
8532fe6060f1SDimitry Andric       assert(RdxDesc.getRecurrenceStartValue() ==
8533fe6060f1SDimitry Andric              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8534fe6060f1SDimitry Andric       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8535fe6060f1SDimitry Andric                                            CM.isInLoopReduction(Phi),
8536fe6060f1SDimitry Andric                                            CM.useOrderedReductions(RdxDesc));
8537fe6060f1SDimitry Andric     } else {
8538bdd1243dSDimitry Andric       // TODO: Currently fixed-order recurrences are modeled as chains of
8539bdd1243dSDimitry Andric       // first-order recurrences. If there are no users of the intermediate
8540bdd1243dSDimitry Andric       // recurrences in the chain, the fixed order recurrence should be modeled
8541bdd1243dSDimitry Andric       // directly, enabling more efficient codegen.
8542fe6060f1SDimitry Andric       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8543e8d8bef9SDimitry Andric     }
8544e8d8bef9SDimitry Andric 
8545fe6060f1SDimitry Andric     // Record the incoming value from the backedge, so we can add the incoming
8546fe6060f1SDimitry Andric     // value from the backedge after all recipes have been created.
8547bdd1243dSDimitry Andric     auto *Inc = cast<Instruction>(
8548bdd1243dSDimitry Andric         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8549bdd1243dSDimitry Andric     auto RecipeIter = Ingredient2Recipe.find(Inc);
8550bdd1243dSDimitry Andric     if (RecipeIter == Ingredient2Recipe.end())
8551bdd1243dSDimitry Andric       recordRecipeOf(Inc);
8552bdd1243dSDimitry Andric 
8553fe6060f1SDimitry Andric     PhisToFix.push_back(PhiRecipe);
8554fe6060f1SDimitry Andric     return toVPRecipeResult(PhiRecipe);
8555fe6060f1SDimitry Andric   }
8556fe6060f1SDimitry Andric 
8557fe6060f1SDimitry Andric   if (isa<TruncInst>(Instr) &&
8558fe6060f1SDimitry Andric       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8559fe6060f1SDimitry Andric                                                Range, *Plan)))
8560fe6060f1SDimitry Andric     return toVPRecipeResult(Recipe);
85615ffd83dbSDimitry Andric 
856281ad6265SDimitry Andric   // All widen recipes below deal only with VF > 1.
856381ad6265SDimitry Andric   if (LoopVectorizationPlanner::getDecisionAndClampRange(
856481ad6265SDimitry Andric           [&](ElementCount VF) { return VF.isScalar(); }, Range))
856581ad6265SDimitry Andric     return nullptr;
856681ad6265SDimitry Andric 
856781ad6265SDimitry Andric   if (auto *CI = dyn_cast<CallInst>(Instr))
8568fe013be4SDimitry Andric     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
856981ad6265SDimitry Andric 
857081ad6265SDimitry Andric   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
857181ad6265SDimitry Andric     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
857281ad6265SDimitry Andric 
85735ffd83dbSDimitry Andric   if (!shouldWiden(Instr, Range))
85745ffd83dbSDimitry Andric     return nullptr;
85755ffd83dbSDimitry Andric 
85765ffd83dbSDimitry Andric   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8577fe6060f1SDimitry Andric     return toVPRecipeResult(new VPWidenGEPRecipe(
8578fe013be4SDimitry Andric         GEP, make_range(Operands.begin(), Operands.end())));
85795ffd83dbSDimitry Andric 
85805ffd83dbSDimitry Andric   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8581fe6060f1SDimitry Andric     return toVPRecipeResult(new VPWidenSelectRecipe(
8582fe013be4SDimitry Andric         *SI, make_range(Operands.begin(), Operands.end())));
8583fe013be4SDimitry Andric   }
8584fe013be4SDimitry Andric 
8585fe013be4SDimitry Andric   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8586c9157d92SDimitry Andric     return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8587c9157d92SDimitry Andric                                                   CI->getType(), *CI));
85880b57cec5SDimitry Andric   }
85890b57cec5SDimitry Andric 
8590bdd1243dSDimitry Andric   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
85910b57cec5SDimitry Andric }
85920b57cec5SDimitry Andric 
8593e8d8bef9SDimitry Andric void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8594e8d8bef9SDimitry Andric                                                         ElementCount MaxVF) {
8595e8d8bef9SDimitry Andric   assert(OrigLoop->isInnermost() && "Inner loop expected.");
85960b57cec5SDimitry Andric 
8597fe013be4SDimitry Andric   auto MaxVFTimes2 = MaxVF * 2;
8598fe013be4SDimitry Andric   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8599fe013be4SDimitry Andric     VFRange SubRange = {VF, MaxVFTimes2};
8600c9157d92SDimitry Andric     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8601c9157d92SDimitry Andric       // Now optimize the initial VPlan.
8602c9157d92SDimitry Andric       if (!Plan->hasVF(ElementCount::getFixed(1)))
8603c9157d92SDimitry Andric         VPlanTransforms::truncateToMinimalBitwidths(
8604c9157d92SDimitry Andric             *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8605c9157d92SDimitry Andric       VPlanTransforms::optimize(*Plan, *PSE.getSE());
8606c9157d92SDimitry Andric       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8607c9157d92SDimitry Andric       VPlans.push_back(std::move(Plan));
8608c9157d92SDimitry Andric     }
86090b57cec5SDimitry Andric     VF = SubRange.End;
86100b57cec5SDimitry Andric   }
86110b57cec5SDimitry Andric }
86120b57cec5SDimitry Andric 
8613753f127fSDimitry Andric // Add the necessary canonical IV and branch recipes required to control the
8614753f127fSDimitry Andric // loop.
8615c9157d92SDimitry Andric static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8616c9157d92SDimitry Andric                                   DebugLoc DL) {
861704eeddc0SDimitry Andric   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8618fe013be4SDimitry Andric   auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
861904eeddc0SDimitry Andric 
8620753f127fSDimitry Andric   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
862104eeddc0SDimitry Andric   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
862204eeddc0SDimitry Andric   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
862304eeddc0SDimitry Andric   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
862404eeddc0SDimitry Andric   Header->insert(CanonicalIVPHI, Header->begin());
862504eeddc0SDimitry Andric 
8626753f127fSDimitry Andric   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8627753f127fSDimitry Andric   // IV by VF * UF.
862804eeddc0SDimitry Andric   auto *CanonicalIVIncrement =
8629c9157d92SDimitry Andric       new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8630c9157d92SDimitry Andric                         {HasNUW, false}, DL, "index.next");
863104eeddc0SDimitry Andric   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
863204eeddc0SDimitry Andric 
863381ad6265SDimitry Andric   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8634fe013be4SDimitry Andric   EB->appendRecipe(CanonicalIVIncrement);
8635fe013be4SDimitry Andric 
8636753f127fSDimitry Andric   // Add the BranchOnCount VPInstruction to the latch.
8637c9157d92SDimitry Andric   VPInstruction *BranchBack =
8638c9157d92SDimitry Andric       new VPInstruction(VPInstruction::BranchOnCount,
863904eeddc0SDimitry Andric                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8640753f127fSDimitry Andric   EB->appendRecipe(BranchBack);
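  // In VPlan-printing style, the added recipes look roughly like:
  //   vector.body:
  //     EMIT vp<%index> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
  //     ...
  //     EMIT vp<%index.next> = add vp<%index>, vp<%VFxUF>
  //     EMIT branch-on-count vp<%index.next>, vp<%vector.trip.count>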
8641753f127fSDimitry Andric }
864204eeddc0SDimitry Andric 
864381ad6265SDimitry Andric // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
864481ad6265SDimitry Andric // original exit block.
8645c9157d92SDimitry Andric static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
864681ad6265SDimitry Andric                                 VPlan &Plan) {
864781ad6265SDimitry Andric   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
864881ad6265SDimitry Andric   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
864981ad6265SDimitry Andric   // Only handle single-exit loops with unique exit blocks for now.
865081ad6265SDimitry Andric   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
865181ad6265SDimitry Andric     return;
865281ad6265SDimitry Andric 
865381ad6265SDimitry Andric   // Introduce VPUsers modeling the exit values.
865481ad6265SDimitry Andric   for (PHINode &ExitPhi : ExitBB->phis()) {
865581ad6265SDimitry Andric     Value *IncomingValue =
865681ad6265SDimitry Andric         ExitPhi.getIncomingValueForBlock(ExitingBB);
8657fe013be4SDimitry Andric     VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
865881ad6265SDimitry Andric     Plan.addLiveOut(&ExitPhi, V);
865981ad6265SDimitry Andric   }
866081ad6265SDimitry Andric }
866181ad6265SDimitry Andric 
8662c9157d92SDimitry Andric VPlanPtr
8663c9157d92SDimitry Andric LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
86640b57cec5SDimitry Andric 
8665480093f4SDimitry Andric   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8666480093f4SDimitry Andric 
86675ffd83dbSDimitry Andric   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8668480093f4SDimitry Andric 
8669480093f4SDimitry Andric   // ---------------------------------------------------------------------------
8670480093f4SDimitry Andric   // Pre-construction: record ingredients whose recipes we'll need to further
8671480093f4SDimitry Andric   // process after constructing the initial VPlan.
8672480093f4SDimitry Andric   // ---------------------------------------------------------------------------
8673480093f4SDimitry Andric 
8674480093f4SDimitry Andric   // For each interleave group which is relevant for this (possibly trimmed)
8675480093f4SDimitry Andric   // Range, add it to the set of groups to be later applied to the VPlan and add
8676480093f4SDimitry Andric   // placeholders for its members' Recipes which we'll be replacing with a
8677480093f4SDimitry Andric   // single VPInterleaveRecipe.
8678480093f4SDimitry Andric   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8679e8d8bef9SDimitry Andric     auto applyIG = [IG, this](ElementCount VF) -> bool {
8680fe013be4SDimitry Andric       bool Result = (VF.isVector() && // Query is illegal for VF == 1
8681480093f4SDimitry Andric                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
8682480093f4SDimitry Andric                          LoopVectorizationCostModel::CM_Interleave);
8683fe013be4SDimitry Andric       // For scalable vectors, the only interleave factor currently supported
8684fe013be4SDimitry Andric       // is 2 since we require the (de)interleave2 intrinsics instead of
8685fe013be4SDimitry Andric       // shufflevectors.
8686fe013be4SDimitry Andric       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8687fe013be4SDimitry Andric              "Unsupported interleave factor for scalable vectors");
8688fe013be4SDimitry Andric       return Result;
8689480093f4SDimitry Andric     };
8690480093f4SDimitry Andric     if (!getDecisionAndClampRange(applyIG, Range))
8691480093f4SDimitry Andric       continue;
8692480093f4SDimitry Andric     InterleaveGroups.insert(IG);
8693480093f4SDimitry Andric     for (unsigned i = 0; i < IG->getFactor(); i++)
8694480093f4SDimitry Andric       if (Instruction *Member = IG->getMember(i))
8695480093f4SDimitry Andric         RecipeBuilder.recordRecipeOf(Member);
8696480093f4SDimitry Andric   };
8697480093f4SDimitry Andric 
8698480093f4SDimitry Andric   // ---------------------------------------------------------------------------
8699480093f4SDimitry Andric   // Build initial VPlan: Scan the body of the loop in a topological order to
8700480093f4SDimitry Andric   // visit each basic block after having visited its predecessor basic blocks.
8701480093f4SDimitry Andric   // ---------------------------------------------------------------------------
87020b57cec5SDimitry Andric 
8703fe013be4SDimitry Andric   // Create initial VPlan skeleton, having a basic block for the pre-header
8704fe013be4SDimitry Andric   // which contains SCEV expansions that need to happen before the CFG is
8705fe013be4SDimitry Andric   // modified; a basic block for the vector pre-header, followed by a region for
8706fe013be4SDimitry Andric   // the vector loop, followed by the middle basic block. The skeleton vector
8707fe013be4SDimitry Andric   // loop region contains a header and latch basic blocks.
8708fe013be4SDimitry Andric   VPlanPtr Plan = VPlan::createInitialVPlan(
8709fe013be4SDimitry Andric       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8710fe013be4SDimitry Andric       *PSE.getSE());
871181ad6265SDimitry Andric   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
87120eae32dcSDimitry Andric   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
87130eae32dcSDimitry Andric   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8714c9157d92SDimitry Andric   Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8715c9157d92SDimitry Andric   Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
87160b57cec5SDimitry Andric 
8717fe013be4SDimitry Andric   // Don't use getDecisionAndClampRange here, because we don't know the UF,
8718fe013be4SDimitry Andric   // so it is better to be conservative here rather than splitting
8719fe013be4SDimitry Andric   // the range up into different VPlans.
8720c9157d92SDimitry Andric   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8721fe013be4SDimitry Andric   bool IVUpdateMayOverflow = false;
8722fe013be4SDimitry Andric   for (ElementCount VF : Range)
8723fe013be4SDimitry Andric     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8724fe013be4SDimitry Andric 
8725c9157d92SDimitry Andric   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8726c9157d92SDimitry Andric   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8727c9157d92SDimitry Andric   // When not folding the tail, we know that the induction increment will not
8728c9157d92SDimitry Andric   // overflow.
8729c9157d92SDimitry Andric   bool HasNUW = Style == TailFoldingStyle::None;
8730c9157d92SDimitry Andric   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
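  // The call above introduces, roughly (an illustrative sketch): a canonical
  // IV phi starting at 0 in the header, an increment of that phi by VF * UF
  // (carrying nuw when HasNUW) in the latch, and a branch-on-count terminator
  // comparing the incremented value against the trip count.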

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  VPBasicBlock *VPBB = HeaderVPBB;
  bool NeedsMasks = CM.foldTailByMasking() ||
                    any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
                      return Legal->blockNeedsPredication(BB);
                    });
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    if (VPBB != HeaderVPBB)
      VPBB->setName(BB->getName());
    Builder.setInsertPoint(VPBB);

    if (VPBB == HeaderVPBB)
      RecipeBuilder.createHeaderMask(*Plan);
    else if (NeedsMasks)
      RecipeBuilder.createBlockInMask(BB, *Plan);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
      Instruction *Instr = &I;
      SmallVector<VPValue *, 4> Operands;
      auto *Phi = dyn_cast<PHINode>(Instr);
      if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
        Operands.push_back(Plan->getVPValueOrAddLiveIn(
            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
      } else {
        auto OpRange = Plan->mapToVPValues(Instr->operands());
        Operands = {OpRange.begin(), OpRange.end()};
      }

      // Invariant stores inside the loop will be deleted, and a single store
      // with the final reduction value will be added to the exit block.
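      // For example (an illustrative sketch): in
      //   for (i) { sum += a[i]; *p = sum; }
      // the store to the invariant address *p is skipped here; one store of
      // the final reduction result is emitted after the loop instead.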
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(&I)) &&
          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
        continue;

      auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
          Instr, Operands, Range, VPBB, Plan);
      if (!RecipeOrValue)
        RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
      // If Instr can be simplified to an existing VPValue, use it.
      if (isa<VPValue *>(RecipeOrValue)) {
        auto *VPV = cast<VPValue *>(RecipeOrValue);
        Plan->addVPValue(Instr, VPV);
        // If the re-used value is a recipe, register the recipe for the
        // instruction, in case the recipe for Instr needs to be recorded.
        if (VPRecipeBase *R = VPV->getDefiningRecipe())
          RecipeBuilder.setRecipe(Instr, R);
        continue;
      }
      // Otherwise, add the new recipe.
      VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
      for (auto *Def : Recipe->definedValues()) {
        auto *UV = Def->getUnderlyingValue();
        Plan->addVPValue(UV, Def);
      }

      RecipeBuilder.setRecipe(Instr, Recipe);
      if (isa<VPHeaderPHIRecipe>(Recipe)) {
        // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
        // the following cases, VPHeaderPHIRecipes may be created after non-phi
        // recipes and need to be moved to the phi section of HeaderVPBB:
        // * tail-folding (non-phi recipes computing the header mask are
        //   introduced earlier than regular header phi recipes, and should
        //   appear after them)
        // * optimizing truncates to VPWidenIntOrFpInductionRecipe.

        assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
                CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
               "unexpected recipe needs moving");
        Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
      } else
        VPBB->appendRecipe(Recipe);
    }

    VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
    VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
  }

  // After here, VPBB should not be used.
  VPBB = nullptr;

  if (CM.requiresScalarEpilogue(Range)) {
    // No edge from the middle block to the unique exit block has been
    // inserted, and there is nothing to fix from the vector loop; phis should
    // have incoming values from the scalar loop only.
  } else
    addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);

  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
         "entry block must be set to a VPRegionBlock having a non-empty entry "
         "VPBasicBlock");
  RecipeBuilder.fixHeaderPhis();

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Adjust the recipes for any in-loop reductions.
  adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
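  // For example (an illustrative sketch): two VPWidenMemoryInstructionRecipes
  // loading A[2*i] and A[2*i+1] become one VPInterleaveRecipe defining two
  // VPValues; users of the original member recipes are rewired to those
  // values and the member recipes are erased.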
  for (const auto *IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
        auto *StoreR =
            cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
        StoredValues.push_back(StoreR->getStoredValue());
      }

    bool NeedsMaskForGaps =
        IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                        Recipe->getMask(), NeedsMaskForGaps);
    VPIG->insertBefore(Recipe);
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = MemberR->getVPSingleValue();
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        MemberR->eraseFromParent();
      }
  }

  for (ElementCount VF : Range)
    Plan->addVF(VF);
  Plan->setName("Initial VPlan");

  // Replace VPValues for known constant strides guaranteed by predicated
  // scalar evolution.
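  // For example (an illustrative sketch): if versioning guarantees that
  // %stride == 1, the live-in VPValue for %stride is replaced by the constant
  // 1, so recipes using the stride see the constant directly.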
  for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
    auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
    // Only handle constant strides for now.
    if (!ScevStride)
      continue;
    Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());

    auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
    // The versioned value may not be used in the loop directly, so just add a
    // new live-in in those cases.
    Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
  }

  // From this point onwards, VPlan-to-VPlan transformations may change the
  // plan in ways that make accessing values through the original IR values
  // incorrect.
  Plan->disableValue2VPValue();

  // Sink users of fixed-order recurrences past the recipes defining their
  // previous values, and introduce FirstOrderRecurrenceSplice VPInstructions.
  if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
    return nullptr;

  if (useActiveLaneMask(Style)) {
    // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
    // TailFoldingStyle is visible there.
    bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
    bool WithoutRuntimeCheck =
        Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
    VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
                                       WithoutRuntimeCheck);
  }
  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = VPlan::createInitialVPlan(
      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
      *PSE.getSE());

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF : Range)
    Plan->addVF(VF);

  VPlanTransforms::VPInstructionsToVPRecipes(
      Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
      *PSE.getSE(), *TLI);

  // Remove the existing terminator of the exiting block of the top-most
  // region. A BranchOnCount will be added instead when adding the canonical IV
  // recipes.
  auto *Term =
      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  Term->eraseFromParent();

  // Tail folding is not supported for outer loops, so the induction increment
  // is guaranteed to not wrap.
  bool HasNUW = true;
  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
                        DebugLoc());
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions, the chain of
// instructions leading from the loop exit instruction to the phi needs to be
// converted to a reduction, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
//
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
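// For example (an illustrative sketch): for an in-loop integer add reduction,
// a widened link such as
//   %sum.next = add %vec.val, %sum
// is replaced by a VPReductionRecipe reducing %vec.val into the scalar chain,
// and a ComputeReductionResult VPInstruction in the middle block produces the
// final value.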
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
  // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
  // sunk outside of the loop keep the same order they had in the original
  // loop.
  SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      ReductionPHIList.emplace_back(ReductionPhi);
  }
  bool HasIntermediateStore = false;
  stable_sort(ReductionPHIList,
              [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
                                            const VPReductionPHIRecipe *R2) {
                auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
                auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
                HasIntermediateStore |= IS1 || IS2;

                // If neither of the recipes has an intermediate store, keep
                // the order the same.
                if (!IS1 && !IS2)
                  return false;

                // If only one of the recipes has an intermediate store, then
                // move it towards the beginning of the list.
                if (IS1 && !IS2)
                  return true;

                if (!IS1 && IS2)
                  return false;

                // If both recipes have an intermediate store, then the recipe
                // with the later store should be processed earlier. So it
                // should go to the beginning of the list.
                return DT->dominates(IS2, IS1);
              });

  if (HasIntermediateStore && ReductionPHIList.size() > 1)
    for (VPRecipeBase *R : ReductionPHIList)
      R->moveBefore(*Header, Header->getFirstNonPhi());

  for (VPRecipeBase &R : Header->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
      continue;

    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
           "AnyOf reductions are not allowed for in-loop reductions");

    // Collect the chain of "link" recipes for the reduction starting at PhiR.
    SetVector<VPSingleDefRecipe *> Worklist;
    Worklist.insert(PhiR);
    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPSingleDefRecipe *Cur = Worklist[I];
      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
        if (!UserRecipe) {
          assert(isa<VPLiveOut>(U) &&
                 "U must either be a VPSingleDef or VPLiveOut");
          continue;
        }
        Worklist.insert(UserRecipe);
      }
    }

    // Visit operation "Links" along the reduction chain top-down starting from
    // the phi until LoopExitValue. We keep track of the previous item
    // (PreviousLink) to tell which of the two operands of a Link will remain
    // scalar and which will be reduced. For minmax by select(cmp), Link will
    // be the select instruction.
    VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
    for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
      Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();

      // Index of the first operand which holds a non-mask vector operand.
      unsigned IndexOfFirstOperand;
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      VPValue *VecOp;
      VPBasicBlock *LinkVPBB = CurrentLink->getParent();
      if (IsFMulAdd) {
        assert(
            RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
            "Expected instruction to be a call to the llvm.fmuladd intrinsic");
        assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
                isa<VPWidenCallRecipe>(CurrentLink)) &&
               CurrentLink->getOperand(2) == PreviousLink &&
               "expected a call where the previous link is the added operand");

        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe (multiplying the first two operands of
        // the fmuladd together) to use as the vector operand for the fadd
        // reduction.
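        // For example (an illustrative sketch):
        //   %r = call float @llvm.fmuladd.f32(float %a, float %b, float %phi)
        // becomes %mul = fmul float %a, %b, with %mul then reduced into the
        // chain by the fadd reduction recipe created below.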
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else {
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          if (isa<VPWidenRecipe>(CurrentLink)) {
            assert(isa<CmpInst>(CurrentLinkI) &&
                   "need to have the compare of the select");
            continue;
          }
          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
                 "must be a select recipe");
          IndexOfFirstOperand = 1;
        } else {
          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
                 "Expected to replace a VPWidenSC");
          IndexOfFirstOperand = 0;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(VecOpId);
        assert(VecOp != PreviousLink &&
               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
                                       (VecOpId - IndexOfFirstOperand)) ==
                   PreviousLink &&
               "PreviousLink must be the operand other than VecOp");
      }

      BasicBlock *BB = CurrentLinkI->getParent();
      VPValue *CondOp = nullptr;
      if (CM.blockNeedsPredicationForAnyReason(BB)) {
        VPBuilder::InsertPointGuard Guard(Builder);
        Builder.setInsertPoint(CurrentLink);
        CondOp = RecipeBuilder.getBlockInMask(BB);
      }

      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
          RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
      // Note that this transformation may leave over dead recipes (including
      // CurrentLink), which will be cleaned up by a later VPlan transform.
      LinkVPBB->appendRecipe(RedRecipe);
      CurrentLink->replaceAllUsesWith(RedRecipe);
      PreviousLink = RedRecipe;
    }
  }
  Builder.setInsertPoint(&*LatchVPBB->begin());
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;

    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    // If the tail is folded by masking, introduce selects between the phi
    // and the live-out instruction of each reduction, at the beginning of the
    // dedicated latch block.
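    // For example (an illustrative sketch):
    //   %rdx.select = select <VF x i1> %header.mask, %rdx.exiting, %rdx.phi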
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
      VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
      assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
             "reduction recipe must be defined before latch");
      Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
      std::optional<FastMathFlags> FMFs =
          PhiTy->isFloatingPointTy()
              ? std::make_optional(RdxDesc.getFastMathFlags())
              : std::nullopt;
      NewExitingVPV =
          Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
      OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
        return isa<VPInstruction>(&U) &&
               cast<VPInstruction>(&U)->getOpcode() ==
                   VPInstruction::ComputeReductionResult;
      });
      if (PreferPredicatedReductionSelect ||
          TTI.preferPredicatedReductionSelect(
              PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags()))
        PhiR->setOperand(1, NewExitingVPV);
    }

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
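    // For example (an illustrative sketch), for an i32 phi whose recurrence
    // type is i8:
    //   %trunc = trunc <VF x i32> %exit.val to <VF x i8>
    //   %extnd = zext  <VF x i8>  %trunc    to <VF x i32>  ; sext if signed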
    Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      auto *Trunc =
          new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
      auto *Extnd =
          RdxDesc.isSigned()
              ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
              : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);

      Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
      Extnd->insertAfter(Trunc);
      if (PhiR->getOperand(1) == NewExitingVPV)
        PhiR->setOperand(1, Extnd->getVPSingleValue());
      NewExitingVPV = Extnd;
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    auto *FinalReductionResult = new VPInstruction(
        VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
    cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
        ->appendRecipe(FinalReductionResult);
    OrigExitingVPV->replaceUsesWithIf(
        FinalReductionResult,
        [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
  }

  VPlanTransforms::clearReductionWrapFlags(*Plan);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << "  store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << "  ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
         "Unexpected type.");

  auto *IVR = getParent()->getPlan()->getCanonicalIV();
  PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));

  if (onlyScalarsGenerated(State.VF)) {
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = State.Builder.CreateSExtOrTrunc(
        CanonicalIV, IndDesc.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    bool IsUniform = vputils::onlyFirstLaneUsed(this);
    assert((IsUniform || !State.VF.isScalable()) &&
           "Cannot scalarize a scalable VF");
    unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *PartStart =
          createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);

      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        Value *Idx = State.Builder.CreateAdd(
            PartStart, ConstantInt::get(PtrInd->getType(), Lane));
        Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);

        Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
        Value *SclrGep = emitTransformedIndex(
            State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
            IndDesc.getKind(), IndDesc.getInductionBinOp());
        SclrGep->setName("next.gep");
        State.set(this, SclrGep, VPIteration(Part, Lane));
      }
    }
    return;
  }

  Type *PhiType = IndDesc.getStep()->getType();

  // Build a pointer phi.
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();
  PHINode *NewPointerPhi =
      PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);

  // A pointer induction, performed by using a gep.
  Instruction *InductionLoc = &*State.Builder.GetInsertPoint();

  Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
  Value *NumUnrolledElems =
      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
  Value *InductionGEP = GetElementPtrInst::Create(
      State.Builder.getInt8Ty(), NewPointerPhi,
      State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
      InductionLoc);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  NewPointerPhi->addIncoming(InductionGEP, VectorPH);

  // Create UF many actual address geps that use the pointer
  // phi as base and a vectorized version of the step value
  // (<step*0, ..., step*N>) as offset.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Type *VecPhiType = VectorType::get(PhiType, State.VF);
    Value *StartOffsetScalar =
        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
    Value *StartOffset =
        State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
    // Create a vector of consecutive numbers from zero to VF.
    StartOffset = State.Builder.CreateAdd(
        StartOffset, State.Builder.CreateStepVector(VecPhiType));

    assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
           "scalar step must be the same across all parts");
    Value *GEP = State.Builder.CreateGEP(
        State.Builder.getInt8Ty(), NewPointerPhi,
        State.Builder.CreateMul(
            StartOffset,
            State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
            "vector.gep"));
    State.set(this, GEP, Part);
  }
}

void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "VPDerivedIVRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
  Value *DerivedIV = emitTransformedIndex(
      State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
      Kind, cast_if_present<BinaryOperator>(FPBinOp));
  DerivedIV->setName("offset.idx");
  if (TruncResultTy) {
    assert(TruncResultTy != DerivedIV->getType() &&
           Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
  }
  assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");

  State.set(this, DerivedIV, VPIteration(0, 0));
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask(),
                                      NeedsMaskForGaps);
}

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
                                           : State.get(Cond, {Part, 0});
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden =
            State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

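      // Select the vector operand where the mask is true and the reduction
      // identity elsewhere, so masked-off lanes do not affect the result.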
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
    // Insert scalar instance packing it into a vector.
    if (State.VF.isVector() && shouldPack()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(UI->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.packScalarIntoVectorValue(this, *State.Instance);
    }
    return;
  }

  if (IsUniform) {
    // If the recipe is uniform across all parts (instead of just per VF), only
    // generate a single instance.
    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
        all_of(operands(), [](VPValue *Op) {
          return Op->isDefinedOutsideVectorRegions();
        })) {
      State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
      if (user_begin() != user_end()) {
        for (unsigned Part = 1; Part < State.UF; ++Part)
          State.set(this, State.get(this, VPIteration(0, 0)),
                    VPIteration(Part, 0));
      }
      return;
    }

    // Uniform within VL means we need to generate lane 0 only for each
    // unrolled copy.
    for (unsigned Part = 0; Part < State.UF; ++Part)
      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
    return;
  }

  // A store of a loop-varying value to a uniform address only needs the last
  // copy of the store.
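  // For example (an illustrative sketch): in "for (i) *p = a[i];" only the
  // value stored on the final iteration is observable, so a single store of
  // the last part's last lane suffices.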
9451fe013be4SDimitry Andric   if (isa<StoreInst>(UI) &&
9452fe013be4SDimitry Andric       vputils::isUniformAfterVectorization(getOperand(1))) {
9453bdd1243dSDimitry Andric     auto Lane = VPLane::getLastLaneForVF(State.VF);
9454fe013be4SDimitry Andric     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9455fcaf7f86SDimitry Andric                                     State);
9456fcaf7f86SDimitry Andric     return;
9457fcaf7f86SDimitry Andric   }
9458fcaf7f86SDimitry Andric 
9459fcaf7f86SDimitry Andric   // Generate scalar instances for all VF lanes of all UF parts.
9460fcaf7f86SDimitry Andric   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9461fcaf7f86SDimitry Andric   const unsigned EndLane = State.VF.getKnownMinValue();
94620b57cec5SDimitry Andric   for (unsigned Part = 0; Part < State.UF; ++Part)
94630b57cec5SDimitry Andric     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9464fe013be4SDimitry Andric       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
94650b57cec5SDimitry Andric }
94660b57cec5SDimitry Andric 
execute(VPTransformState & State)94670b57cec5SDimitry Andric void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9468e8d8bef9SDimitry Andric   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
94694824e7fdSDimitry Andric 
94704824e7fdSDimitry Andric   // Attempt to issue a wide load.
94714824e7fdSDimitry Andric   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
94724824e7fdSDimitry Andric   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
94734824e7fdSDimitry Andric 
94744824e7fdSDimitry Andric   assert((LI || SI) && "Invalid Load/Store instruction");
94754824e7fdSDimitry Andric   assert((!SI || StoredValue) && "No stored value provided for widened store");
94764824e7fdSDimitry Andric   assert((!LI || !StoredValue) && "Stored value provided for widened load");
94774824e7fdSDimitry Andric 
94784824e7fdSDimitry Andric   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
94794824e7fdSDimitry Andric 
94804824e7fdSDimitry Andric   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
94814824e7fdSDimitry Andric   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9482fe013be4SDimitry Andric   bool CreateGatherScatter = !isConsecutive();
94834824e7fdSDimitry Andric 
94844824e7fdSDimitry Andric   auto &Builder = State.Builder;
94854824e7fdSDimitry Andric   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
94864824e7fdSDimitry Andric   bool isMaskRequired = getMask();
9487c9157d92SDimitry Andric   if (isMaskRequired) {
9488de8261c4SDimitry Andric     // Mask reversal is only needed for non-all-one (null) masks, as reverse of
9489de8261c4SDimitry Andric     // a null all-one mask is a null mask.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Mask = State.get(getMask(), Part);
      if (isReverse())
        Mask = Builder.CreateVectorReverse(Mask, "reverse");
      BlockInMaskParts[Part] = Mask;
    }
  }

  // Handle stores.
  if (SI) {
    State.setDebugLocFrom(SI->getDebugLoc());

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (isReverse()) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
          // We don't want to update the value in the map as it might be used
          // in another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = State.get(getAddr(), Part);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      State.addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  State.setDebugLocFrom(LI->getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(getAddr(), Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      State.addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = State.get(getAddr(), Part);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load itself; the value recorded in the state
      // below is the reverse shuffle when the access is reversed.
      State.addMetadata(NewLI, LI);
      if (isReverse())
        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
    }

    State.set(getVPSingleValue(), NewLI, Part);
  }
}

// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyzes whether the loop is suitable
// for predication.
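// For illustration (source-level view, not part of this pass): the "predicate
// compiler options" and "loop hints" below correspond to user controls such
// as the -prefer-predicate-over-epilogue=... option and
//   #pragma clang loop vectorize_predicate(enable)
// which requests a tail-folded (predicated) loop and maps to
// LoopVectorizeHints::FK_Enabled in Hints.getPredicate().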
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the command-line directives.
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    }
  }

  // 3) If set, obey the loop hints.
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  }

  // 4) If the TTI hook indicates this is profitable, request predication.
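  // (Illustration, not exhaustive: targets with hardware support for
  // predicated loops, e.g. ARM MVE tail-predication or AArch64 SVE when tail
  // folding is enabled, may return true from preferPredicateOverEpilogue.)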
  TailFoldingInfo TFI(TLI, &LVL, IAI);
  if (TTI->preferPredicateOverEpilogue(&TFI))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
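// (Illustrative usage: this path is opt-in and is exercised with the
// -enable-vplan-native-path flag, e.g. "opt -passes=loop-vectorize
// -enable-vplan-native-path", typically on outer loops annotated with
// "#pragma clang loop vectorize(enable)".)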
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get the user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize; return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  {
    bool AddBranchWeights =
        hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                             F->getParent()->getDataLayout(), AddBranchWeights);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
  }

  reportVectorization(ORE, L, VF, 1);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension: if the vectorized loop was generated with double-precision
// arithmetic, there will be a performance penalty from the conversion
// overhead and from the change in the effective vector width.
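// Illustrative source pattern that triggers the remark (hypothetical code,
// not part of this pass): the constant 0.5 is a double, so each float element
// is extended to double and truncated back, halving the useful vector width.
//
//   void scale(float *A, int N) {
//     for (int i = 0; i < N; ++i)
//       A[i] = A[i] * 0.5; // fpext float->double, fmul, fptrunc double->float
//   }
//
// Writing the constant as 0.5f keeps the computation in single precision.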
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       std::optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE,
                                       ScalarEpilogueLowering SEL) {
  InstructionCost CheckCost = Checks.getCost();
  if (!CheckCost.isValid())
    return false;

  // When interleaving only, the scalar and vector costs will be equal, which
  // in turn would lead to a divide by 0. Fall back to a hard threshold.
  if (VF.Width.isScalar()) {
    if (CheckCost > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user-specified
  // VF/IC. In those cases, runtime checks should always be generated.
  double ScalarC = *VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  //  The total cost of the scalar loop is
  //   ScalarC * TC
  //  where
  //  * TC is the actual trip count of the loop.
  //  * ScalarC is the cost of a single scalar iteration.
  //
  //  The total cost of the vector loop is
  //    RtC + VecC * (TC / VF) + EpiC
  //  where
  //  * RtC is the cost of the generated runtime checks
  //  * VecC is the cost of a single vector iteration.
  //  * TC is the actual trip count of the loop
  //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers, and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = VF.Width.getKnownMinValue();
  if (VF.Width.isScalable()) {
    unsigned AssumedMinimumVscale = 1;
    if (VScale)
      AssumedMinimumVscale = *VScale;
    IntVF *= AssumedMinimumVscale;
  }
  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
  double RtC = *CheckCost.getValue();
  double MinTC1 = RtC / (ScalarC - VecCOverVF);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  // X is currently hard-coded to 10.
  double MinTC2 = RtC * 10 / ScalarC;
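  // Worked example with made-up costs: suppose ScalarC = 4, VecC = 10,
  // IntVF = 4 and RtC = 20. Then VecCOverVF = 10 / 4 = 2.5,
  // MinTC1 = 20 / (4 - 2.5) = 13.33..., and MinTC2 = 20 * 10 / 4 = 50.
  // The larger bound (50) wins below and, if a scalar epilogue is allowed,
  // is rounded up to the next multiple of the VF, i.e. 52 for VF = 4.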
  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(MinTC, IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
                                VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
                    << L->getHeader()->getParent()->getName() << "' from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);

  // Check the loop for a trip count threshold: vectorize loops with a tiny
  // trip count by optimizing for size, to minimize overheads.
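  // (Illustration: the threshold comes from the "vectorizer-min-trip-count"
  // option, 16 by default at the time of writing, so e.g. a loop known to
  // run 8 iterations lands in this code path.)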
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
        LLVM_DEBUG(dbgs() << "\n");
        // Predicate tail-folded loops are efficient even when the loop
        // iteration count is low. However, setting the epilogue policy to
        // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
        // with runtime checks. It's more effective to let
        // `areRuntimeChecksProfitable` determine if vectorization is
        // beneficial for the loop.
        if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
          SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
      } else {
        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                             "small to consider vectorizing.\n");
        reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
            "loop trip count is too low, avoiding vectorization",
            "LowTripCount", ORE, L);
        Hints.emitRemarkWithHints();
        return false;
      }
    }
  }

  // Check the function attributes to see if implicit floats or vectors are
  // allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behavior.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize; return the best VF and its cost.
  std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  bool AddBranchWeights =
      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                           F->getParent()->getDataLayout(), AddBranchWeights);
  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);

    unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out to not be profitable.
    if (VF.Width.isVector() || SelectedIC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);

    // Check if it is profitable to vectorize with runtime checks.
    bool ForceVectorization =
        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
    if (!ForceVectorization &&
        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
                                    *PSE.getSE(), SEL)) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return false;
    }
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
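        // Illustrative numbers: with a main-loop VF of 8 and an epilogue VF
        // of 4, a trip count of 23 runs 16 iterations in the main vector
        // loop, 4 in the vector epilogue, and the final 3 in the scalar
        // remainder loop.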
10153349cc55cSDimitry Andric         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10154fe6060f1SDimitry Andric         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10155fe6060f1SDimitry Andric                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10156e8d8bef9SDimitry Andric 
10157349cc55cSDimitry Andric         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10158cdc20ff6SDimitry Andric         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10159cdc20ff6SDimitry Andric             EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10160e8d8bef9SDimitry Andric         ++LoopsVectorized;
10161e8d8bef9SDimitry Andric 
10162e8d8bef9SDimitry Andric         // Second pass vectorizes the epilogue and adjusts the control flow
10163e8d8bef9SDimitry Andric         // edges from the first pass.
10164e8d8bef9SDimitry Andric         EPI.MainLoopVF = EPI.EpilogueVF;
10165e8d8bef9SDimitry Andric         EPI.MainLoopUF = EPI.EpilogueUF;
10166e8d8bef9SDimitry Andric         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10167fe6060f1SDimitry Andric                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10168fe6060f1SDimitry Andric                                                  Checks);
10169349cc55cSDimitry Andric 
10170349cc55cSDimitry Andric         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
1017181ad6265SDimitry Andric         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
1017281ad6265SDimitry Andric         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
1017381ad6265SDimitry Andric         Header->setName("vec.epilog.vector.body");
1017404eeddc0SDimitry Andric 
10175fe013be4SDimitry Andric         // Re-use the trip count and steps expanded for the main loop, as
10176fe013be4SDimitry Andric         // skeleton creation needs it as a value that dominates both the scalar
10177fe013be4SDimitry Andric         // and vector epilogue loops
10178fe013be4SDimitry Andric         // TODO: This is a workaround needed for epilogue vectorization and it
10179fe013be4SDimitry Andric         // should be removed once induction resume value creation is done
10180fe013be4SDimitry Andric         // directly in VPlan.
10181fe013be4SDimitry Andric         EpilogILV.setTripCount(MainILV.getTripCount());
10182fe013be4SDimitry Andric         for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10183fe013be4SDimitry Andric           auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10184fe013be4SDimitry Andric           auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10185fe013be4SDimitry Andric               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10186fe013be4SDimitry Andric           ExpandR->replaceAllUsesWith(ExpandedVal);
10187fe013be4SDimitry Andric           ExpandR->eraseFromParent();
10188fe013be4SDimitry Andric         }
10189fe013be4SDimitry Andric 
10190bdd1243dSDimitry Andric         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10191bdd1243dSDimitry Andric         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10192bdd1243dSDimitry Andric         // before vectorizing the epilogue loop.
1019304eeddc0SDimitry Andric         for (VPRecipeBase &R : Header->phis()) {
10194bdd1243dSDimitry Andric           if (isa<VPCanonicalIVPHIRecipe>(&R))
10195bdd1243dSDimitry Andric             continue;
10196bdd1243dSDimitry Andric 
10197bdd1243dSDimitry Andric           Value *ResumeV = nullptr;
10198bdd1243dSDimitry Andric           // TODO: Move setting of resume values to prepareToExecute.
1019904eeddc0SDimitry Andric           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10200cdc20ff6SDimitry Andric             ResumeV = ReductionResumeValues
10201cdc20ff6SDimitry Andric                           .find(&ReductionPhi->getRecurrenceDescriptor())
10202cdc20ff6SDimitry Andric                           ->second;
10203bdd1243dSDimitry Andric           } else {
10204bdd1243dSDimitry Andric             // Create induction resume values for both widened pointer and
10205bdd1243dSDimitry Andric             // integer/fp inductions and update the start value of the induction
10206bdd1243dSDimitry Andric             // recipes to use the resume value.
10207bdd1243dSDimitry Andric             PHINode *IndPhi = nullptr;
10208bdd1243dSDimitry Andric             const InductionDescriptor *ID;
10209bdd1243dSDimitry Andric             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10210bdd1243dSDimitry Andric               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10211bdd1243dSDimitry Andric               ID = &Ind->getInductionDescriptor();
10212bdd1243dSDimitry Andric             } else {
10213bdd1243dSDimitry Andric               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10214bdd1243dSDimitry Andric               IndPhi = WidenInd->getPHINode();
10215bdd1243dSDimitry Andric               ID = &WidenInd->getInductionDescriptor();
1021604eeddc0SDimitry Andric             }
10217bdd1243dSDimitry Andric 
10218bdd1243dSDimitry Andric             ResumeV = MainILV.createInductionResumeValue(
10219fe013be4SDimitry Andric                 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10220fe013be4SDimitry Andric                 {EPI.MainLoopIterationCountCheck});
1022104eeddc0SDimitry Andric           }
10222bdd1243dSDimitry Andric           assert(ResumeV && "Must have a resume value");
10223fe013be4SDimitry Andric           VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10224bdd1243dSDimitry Andric           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
1022504eeddc0SDimitry Andric         }
1022604eeddc0SDimitry Andric 
10227349cc55cSDimitry Andric         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10228fe013be4SDimitry Andric                         DT, true, &ExpandedSCEVs);
10229e8d8bef9SDimitry Andric         ++LoopsEpilogueVectorized;
10230e8d8bef9SDimitry Andric 
10231e8d8bef9SDimitry Andric         if (!MainILV.areSafetyChecksAdded())
10232e8d8bef9SDimitry Andric           DisableRuntimeUnroll = true;
10233e8d8bef9SDimitry Andric       } else {
10234753f127fSDimitry Andric         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10235753f127fSDimitry Andric                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10236753f127fSDimitry Andric                                PSI, Checks);
10237349cc55cSDimitry Andric 
10238349cc55cSDimitry Andric         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
1023981ad6265SDimitry Andric         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
102400b57cec5SDimitry Andric         ++LoopsVectorized;
102410b57cec5SDimitry Andric 
10242fe6060f1SDimitry Andric         // Add metadata to disable runtime unrolling a scalar loop when there
10243fe6060f1SDimitry Andric         // are no runtime checks about strides and memory. A scalar loop that is
102440b57cec5SDimitry Andric         // rarely used is not worth unrolling.
102450b57cec5SDimitry Andric         if (!LB.areSafetyChecksAdded())
102460b57cec5SDimitry Andric           DisableRuntimeUnroll = true;
10247e8d8bef9SDimitry Andric       }
102480b57cec5SDimitry Andric       // Report the vectorization decision.
10249c9157d92SDimitry Andric       reportVectorization(ORE, L, VF, IC);
102500b57cec5SDimitry Andric     }
102510b57cec5SDimitry Andric 
10252fe6060f1SDimitry Andric     if (ORE->allowExtraAnalysis(LV_NAME))
10253fe6060f1SDimitry Andric       checkMixedPrecision(L, ORE);
10254fe6060f1SDimitry Andric   }
10255fe6060f1SDimitry Andric 
10256bdd1243dSDimitry Andric   std::optional<MDNode *> RemainderLoopID =
102570b57cec5SDimitry Andric       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
102580b57cec5SDimitry Andric                                       LLVMLoopVectorizeFollowupEpilogue});
1025981ad6265SDimitry Andric   if (RemainderLoopID) {
10260bdd1243dSDimitry Andric     L->setLoopID(*RemainderLoopID);
102610b57cec5SDimitry Andric   } else {
102620b57cec5SDimitry Andric     if (DisableRuntimeUnroll)
102630b57cec5SDimitry Andric       AddRuntimeUnrollDisableMetaData(L);
102640b57cec5SDimitry Andric 
102650b57cec5SDimitry Andric     // Mark the loop as already vectorized to avoid vectorizing again.
102660b57cec5SDimitry Andric     Hints.setAlreadyVectorized();
102670b57cec5SDimitry Andric   }
102680b57cec5SDimitry Andric 
102695ffd83dbSDimitry Andric   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
102700b57cec5SDimitry Andric   return true;
102710b57cec5SDimitry Andric }
102720b57cec5SDimitry Andric 
runImpl(Function & F,ScalarEvolution & SE_,LoopInfo & LI_,TargetTransformInfo & TTI_,DominatorTree & DT_,BlockFrequencyInfo * BFI_,TargetLibraryInfo * TLI_,DemandedBits & DB_,AssumptionCache & AC_,LoopAccessInfoManager & LAIs_,OptimizationRemarkEmitter & ORE_,ProfileSummaryInfo * PSI_)102735ffd83dbSDimitry Andric LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = BFI_;
  TLI = TLI_;
  AC = &AC_;
  LAIs = &LAIs_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
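  // (A max interleave factor below 2 at scalar VF means the target would not
  // even interleave a scalar loop, so there is nothing useful left to do.)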
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
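  // (Simplified form guarantees each loop has a preheader, a single backedge,
  // and dedicated exit blocks.)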
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
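    // (In LCSSA form, every value defined in the loop and used outside it is
    // funnelled through a PHI node in a loop exit block.)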
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed) {
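      // Cached LoopAccessInfo may reference loops, instructions, or SCEVs
      // that the transform just deleted or rewrote, so drop the cache
      // eagerly rather than risk stale pointers.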
      LAIs->clear();

#ifndef NDEBUG
      if (VerifySCEV)
        SE->verify();
#endif
    }
  }

  // Report whether anything changed and whether the change affected the CFG.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
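  // BFI is only consulted for profile-guided heuristics, so only pay for
  // computing it when a profile summary is actually available.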
  BlockFrequencyInfo *BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

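  // Vectorization tends to leave behind redundant debug value records; with
  // assignment tracking enabled there are enough of them that scrubbing them
  // here is worthwhile.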
  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  // We currently do not preserve loop-info/dominator analyses with outer-loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<ScalarEvolutionAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

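  // With default options this prints, e.g.:
  //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>
  // which round-trips through the new pass manager's pipeline parser, e.g.:
  //   opt -passes='loop-vectorize<no-interleave-forced-only>' ...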
  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}