//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
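//
// As an illustrative sketch (not part of this file's implementation), a
// scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten for a vectorization factor of 4 into
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];   // one 'wide' SIMD iteration
//
// with any remaining iterations handled by a scalar remainder loop.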
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
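//
// An illustrative invocation (assuming a standard LLVM build) that exercises
// the native path on an IR file:
//
//   opt -passes=loop-vectorize -enable-vplan-native-path -S input.ll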
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));
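// As an illustrative note: with the default threshold of 16 above, a loop
// whose trip count is known to be, say, 8 is vectorized only if doing so adds
// no scalar iteration overhead (e.g. runtime checks or a scalar epilogue);
// the threshold can be changed for experiments, e.g. via a hypothetical
// -mllvm -vectorizer-min-trip-count=4 invocation.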

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// possible choices. I.e., the vectorizer will try to fold the tail-loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
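// An illustrative (hypothetical) invocation: to request tail-folding with a
// scalar-epilogue fallback when driving LLVM from clang, this option can be
// passed through as
//   clang -O2 -mllvm -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...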

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(
            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
            "data-and-control-without-rt-check",
            "Similar to data-and-control, but remove the runtime check")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
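// In all three cases the 1:127 weights encode that taking the bypass is
// unlikely; interpreted as branch weights, they correspond to roughly one
// bypass per 128 executions of the corresponding check.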

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
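  // For example (illustrative; exact sizes depend on the DataLayout), i24
  // commonly has a type size of 24 bits but an alloc size of 32 bits, so
  // [N x i24] contains padding and is not layout-compatible with <N x i24>.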
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
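/// For example (illustrative only), interleaving <a0, a1, a2, a3> and
/// <b0, b1, b2, b3> with a factor of 2 produces
/// <a0, b0, a1, b1, a2, b2, a3, b3>.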
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
        /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops. \p ExpandedSCEVs is used to
  /// look up SCEV expansions for expressions needed during skeleton creation.
  virtual std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance,
                            VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask, bool NeedsMaskForGaps);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. \p Step is the SCEV-expanded induction step to use. In cases
  /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
  /// and the resume values can come from an additional bypass block, the \p
  /// AdditionalBypass pair provides information about the bypass block and the
  /// end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan, VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};
693e8d8bef9SDimitry Andric
6940b57cec5SDimitry Andric /// The original loop.
6950b57cec5SDimitry Andric Loop *OrigLoop;
6960b57cec5SDimitry Andric
6970b57cec5SDimitry Andric /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
6980b57cec5SDimitry Andric /// dynamic knowledge to simplify SCEV expressions and converts them to a
6990b57cec5SDimitry Andric /// more usable form.
7000b57cec5SDimitry Andric PredicatedScalarEvolution &PSE;
7010b57cec5SDimitry Andric
7020b57cec5SDimitry Andric /// Loop Info.
7030b57cec5SDimitry Andric LoopInfo *LI;
7040b57cec5SDimitry Andric
7050b57cec5SDimitry Andric /// Dominator Tree.
7060b57cec5SDimitry Andric DominatorTree *DT;
7070b57cec5SDimitry Andric
7080b57cec5SDimitry Andric /// Target Library Info.
7090b57cec5SDimitry Andric const TargetLibraryInfo *TLI;
7100b57cec5SDimitry Andric
7110b57cec5SDimitry Andric /// Target Transform Info.
7120b57cec5SDimitry Andric const TargetTransformInfo *TTI;
7130b57cec5SDimitry Andric
7140b57cec5SDimitry Andric /// Assumption Cache.
7150b57cec5SDimitry Andric AssumptionCache *AC;
7160b57cec5SDimitry Andric
7170b57cec5SDimitry Andric /// Interface to emit optimization remarks.
7180b57cec5SDimitry Andric OptimizationRemarkEmitter *ORE;
7190b57cec5SDimitry Andric
7200b57cec5SDimitry Andric /// The vectorization SIMD factor to use. Each vector will have this many
7210b57cec5SDimitry Andric /// vector elements.
722e8d8bef9SDimitry Andric ElementCount VF;
7230b57cec5SDimitry Andric
724753f127fSDimitry Andric ElementCount MinProfitableTripCount;
725753f127fSDimitry Andric
7260b57cec5SDimitry Andric /// The vectorization unroll factor to use. Each scalar is vectorized to this
7270b57cec5SDimitry Andric /// many different vector instructions.
7280b57cec5SDimitry Andric unsigned UF;
7290b57cec5SDimitry Andric
7300b57cec5SDimitry Andric /// The builder that we use
7310b57cec5SDimitry Andric IRBuilder<> Builder;
7320b57cec5SDimitry Andric
7330b57cec5SDimitry Andric // --- Vectorization state ---
7340b57cec5SDimitry Andric
7350b57cec5SDimitry Andric /// The vector-loop preheader.
7360b57cec5SDimitry Andric BasicBlock *LoopVectorPreHeader;
7370b57cec5SDimitry Andric
7380b57cec5SDimitry Andric /// The scalar-loop preheader.
7390b57cec5SDimitry Andric BasicBlock *LoopScalarPreHeader;
7400b57cec5SDimitry Andric
7410b57cec5SDimitry Andric /// Middle Block between the vector and the scalar.
7420b57cec5SDimitry Andric BasicBlock *LoopMiddleBlock;
7430b57cec5SDimitry Andric
744fe6060f1SDimitry Andric /// The unique ExitBlock of the scalar loop if one exists. Note that
745e8d8bef9SDimitry Andric /// there can be multiple exiting edges reaching this block.
7460b57cec5SDimitry Andric BasicBlock *LoopExitBlock;
7470b57cec5SDimitry Andric
7480b57cec5SDimitry Andric /// The scalar loop body.
7490b57cec5SDimitry Andric BasicBlock *LoopScalarBody;
7500b57cec5SDimitry Andric
7510b57cec5SDimitry Andric /// A list of all bypass blocks. The first block is the entry of the loop.
7520b57cec5SDimitry Andric SmallVector<BasicBlock *, 4> LoopBypassBlocks;
7530b57cec5SDimitry Andric
7540b57cec5SDimitry Andric /// Store instructions that were predicated.
7550b57cec5SDimitry Andric SmallVector<Instruction *, 4> PredicatedInstructions;
7560b57cec5SDimitry Andric
7570b57cec5SDimitry Andric /// Trip count of the original loop.
7580b57cec5SDimitry Andric Value *TripCount = nullptr;
7590b57cec5SDimitry Andric
7600b57cec5SDimitry Andric /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
7610b57cec5SDimitry Andric Value *VectorTripCount = nullptr;
7620b57cec5SDimitry Andric
7630b57cec5SDimitry Andric /// The legality analysis.
7640b57cec5SDimitry Andric LoopVectorizationLegality *Legal;
7650b57cec5SDimitry Andric
7660b57cec5SDimitry Andric /// The profitablity analysis.
7670b57cec5SDimitry Andric LoopVectorizationCostModel *Cost;
7680b57cec5SDimitry Andric
7690b57cec5SDimitry Andric // Record whether runtime checks are added.
7700b57cec5SDimitry Andric bool AddedSafetyChecks = false;
7710b57cec5SDimitry Andric
7720b57cec5SDimitry Andric // Holds the end values for each induction variable. We save the end values
7730b57cec5SDimitry Andric // so we can later fix-up the external users of the induction variables.
7740b57cec5SDimitry Andric DenseMap<PHINode *, Value *> IVEndValues;
7750b57cec5SDimitry Andric
776e8d8bef9SDimitry Andric /// BFI and PSI are used to check for profile guided size optimizations.
777e8d8bef9SDimitry Andric BlockFrequencyInfo *BFI;
778e8d8bef9SDimitry Andric ProfileSummaryInfo *PSI;
779e8d8bef9SDimitry Andric
780e8d8bef9SDimitry Andric // Whether this loop should be optimized for size based on profile guided size
781e8d8bef9SDimitry Andric // optimizatios.
782e8d8bef9SDimitry Andric bool OptForSizeBasedOnProfile;
783fe6060f1SDimitry Andric
784fe6060f1SDimitry Andric /// Structure to hold information about generated runtime checks, responsible
785fe6060f1SDimitry Andric /// for cleaning the checks, if vectorization turns out unprofitable.
786fe6060f1SDimitry Andric GeneratedRTChecks &RTChecks;
78704eeddc0SDimitry Andric
78804eeddc0SDimitry Andric // Holds the resume values for reductions in the loops, used to set the
78904eeddc0SDimitry Andric // correct start value of reduction PHIs when vectorizing the epilogue.
79004eeddc0SDimitry Andric SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
79104eeddc0SDimitry Andric ReductionResumeValues;
7920b57cec5SDimitry Andric };

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};
8090b57cec5SDimitry Andric
810e8d8bef9SDimitry Andric /// Encapsulate information regarding vectorization of a loop and its epilogue.
811e8d8bef9SDimitry Andric /// This information is meant to be updated and used across two stages of
812e8d8bef9SDimitry Andric /// epilogue vectorization.
813e8d8bef9SDimitry Andric struct EpilogueLoopVectorizationInfo {
814e8d8bef9SDimitry Andric ElementCount MainLoopVF = ElementCount::getFixed(0);
815e8d8bef9SDimitry Andric unsigned MainLoopUF = 0;
816e8d8bef9SDimitry Andric ElementCount EpilogueVF = ElementCount::getFixed(0);
817e8d8bef9SDimitry Andric unsigned EpilogueUF = 0;
818e8d8bef9SDimitry Andric BasicBlock *MainLoopIterationCountCheck = nullptr;
819e8d8bef9SDimitry Andric BasicBlock *EpilogueIterationCountCheck = nullptr;
820e8d8bef9SDimitry Andric BasicBlock *SCEVSafetyCheck = nullptr;
821e8d8bef9SDimitry Andric BasicBlock *MemSafetyCheck = nullptr;
822e8d8bef9SDimitry Andric Value *TripCount = nullptr;
823e8d8bef9SDimitry Andric Value *VectorTripCount = nullptr;
824e8d8bef9SDimitry Andric
825349cc55cSDimitry Andric EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
826349cc55cSDimitry Andric ElementCount EVF, unsigned EUF)
827349cc55cSDimitry Andric : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
828e8d8bef9SDimitry Andric assert(EUF == 1 &&
829e8d8bef9SDimitry Andric "A high UF for the epilogue loop is likely not beneficial.");
830e8d8bef9SDimitry Andric }
831e8d8bef9SDimitry Andric };
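
// A rough usage sketch (the concrete values are made up for illustration, not
// taken from the pass): a planner that settles on a main loop VF of 8 with
// UF 2 and an epilogue VF of 4 would seed this struct as
//
//   EpilogueLoopVectorizationInfo EPI(/*MVF=*/ElementCount::getFixed(8),
//                                     /*MUF=*/2,
//                                     /*EVF=*/ElementCount::getFixed(4),
//                                     /*EUF=*/1);
//
// and the remaining block and trip-count members are filled in while the two
// skeleton-creation stages run.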
832e8d8bef9SDimitry Andric
833e8d8bef9SDimitry Andric /// An extension of the inner loop vectorizer that creates a skeleton for a
834e8d8bef9SDimitry Andric /// vectorized loop that has its epilogue (residual) also vectorized.
835e8d8bef9SDimitry Andric /// The idea is to run the vplan on a given loop twice, first to set up the
836e8d8bef9SDimitry Andric /// skeleton and vectorize the main loop, and second to complete the skeleton
837e8d8bef9SDimitry Andric /// from the first step and vectorize the epilogue. This is achieved by
838e8d8bef9SDimitry Andric /// deriving two concrete strategy classes from this base class and invoking
839e8d8bef9SDimitry Andric /// them in succession from the loop vectorizer planner.
840e8d8bef9SDimitry Andric class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
841e8d8bef9SDimitry Andric public:
842e8d8bef9SDimitry Andric InnerLoopAndEpilogueVectorizer(
843e8d8bef9SDimitry Andric Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
844e8d8bef9SDimitry Andric DominatorTree *DT, const TargetLibraryInfo *TLI,
845e8d8bef9SDimitry Andric const TargetTransformInfo *TTI, AssumptionCache *AC,
846e8d8bef9SDimitry Andric OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
847e8d8bef9SDimitry Andric LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
848fe6060f1SDimitry Andric BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
849fe6060f1SDimitry Andric GeneratedRTChecks &Checks)
850e8d8bef9SDimitry Andric : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
851753f127fSDimitry Andric EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
852753f127fSDimitry Andric CM, BFI, PSI, Checks),
853e8d8bef9SDimitry Andric EPI(EPI) {}
854e8d8bef9SDimitry Andric
855e8d8bef9SDimitry Andric // Override this function to handle the more complex control flow around the
856e8d8bef9SDimitry Andric // three loops.
857fe013be4SDimitry Andric std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
858fe013be4SDimitry Andric const SCEV2ValueTy &ExpandedSCEVs) final {
859fe013be4SDimitry Andric return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
860e8d8bef9SDimitry Andric }
861e8d8bef9SDimitry Andric
862e8d8bef9SDimitry Andric /// The interface for creating a vectorized skeleton using one of two
863e8d8bef9SDimitry Andric /// different strategies, each corresponding to one execution of the vplan
864e8d8bef9SDimitry Andric /// as described above.
86504eeddc0SDimitry Andric virtual std::pair<BasicBlock *, Value *>
866fe013be4SDimitry Andric createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
867e8d8bef9SDimitry Andric
868e8d8bef9SDimitry Andric /// Holds and updates state information required to vectorize the main loop
869e8d8bef9SDimitry Andric /// and its epilogue in two separate passes. This setup helps us avoid
870e8d8bef9SDimitry Andric /// regenerating and recomputing runtime safety checks. It also helps us to
871e8d8bef9SDimitry Andric /// shorten the iteration-count-check path length for the cases where the
872e8d8bef9SDimitry Andric /// iteration count of the loop is so small that the main vector loop is
873e8d8bef9SDimitry Andric /// completely skipped.
874e8d8bef9SDimitry Andric EpilogueLoopVectorizationInfo &EPI;
875e8d8bef9SDimitry Andric };
876e8d8bef9SDimitry Andric
877e8d8bef9SDimitry Andric /// A specialized derived class of inner loop vectorizer that performs
878e8d8bef9SDimitry Andric /// vectorization of *main* loops in the process of vectorizing loops and their
879e8d8bef9SDimitry Andric /// epilogues.
880e8d8bef9SDimitry Andric class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
881e8d8bef9SDimitry Andric public:
882e8d8bef9SDimitry Andric EpilogueVectorizerMainLoop(
883e8d8bef9SDimitry Andric Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
884e8d8bef9SDimitry Andric DominatorTree *DT, const TargetLibraryInfo *TLI,
885e8d8bef9SDimitry Andric const TargetTransformInfo *TTI, AssumptionCache *AC,
886e8d8bef9SDimitry Andric OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
887e8d8bef9SDimitry Andric LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
888fe6060f1SDimitry Andric BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
889fe6060f1SDimitry Andric GeneratedRTChecks &Check)
890e8d8bef9SDimitry Andric : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891fe6060f1SDimitry Andric EPI, LVL, CM, BFI, PSI, Check) {}
892e8d8bef9SDimitry Andric /// Implements the interface for creating a vectorized skeleton using the
893e8d8bef9SDimitry Andric /// *main loop* strategy (i.e., the first pass of vplan execution).
894fe013be4SDimitry Andric std::pair<BasicBlock *, Value *>
895fe013be4SDimitry Andric createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
896e8d8bef9SDimitry Andric
897e8d8bef9SDimitry Andric protected:
898e8d8bef9SDimitry Andric /// Emits an iteration count bypass check once for the main loop (when \p
899e8d8bef9SDimitry Andric /// ForEpilogue is false) and once for the epilogue loop (when \p
900e8d8bef9SDimitry Andric /// ForEpilogue is true).
90181ad6265SDimitry Andric BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
902e8d8bef9SDimitry Andric void printDebugTracesAtStart() override;
903e8d8bef9SDimitry Andric void printDebugTracesAtEnd() override;
904e8d8bef9SDimitry Andric };
905e8d8bef9SDimitry Andric
906e8d8bef9SDimitry Andric /// A specialized derived class of inner loop vectorizer that performs
907e8d8bef9SDimitry Andric /// vectorization of *epilogue* loops in the process of vectorizing loops and
908e8d8bef9SDimitry Andric /// their epilogues.
909e8d8bef9SDimitry Andric class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
910e8d8bef9SDimitry Andric public:
911fe6060f1SDimitry Andric EpilogueVectorizerEpilogueLoop(
912fe6060f1SDimitry Andric Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
913fe6060f1SDimitry Andric DominatorTree *DT, const TargetLibraryInfo *TLI,
914e8d8bef9SDimitry Andric const TargetTransformInfo *TTI, AssumptionCache *AC,
915fe6060f1SDimitry Andric OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
916fe6060f1SDimitry Andric LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
917fe6060f1SDimitry Andric BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
918fe6060f1SDimitry Andric GeneratedRTChecks &Checks)
919e8d8bef9SDimitry Andric : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
92081ad6265SDimitry Andric EPI, LVL, CM, BFI, PSI, Checks) {
92181ad6265SDimitry Andric TripCount = EPI.TripCount;
92281ad6265SDimitry Andric }
923e8d8bef9SDimitry Andric /// Implements the interface for creating a vectorized skeleton using the
924e8d8bef9SDimitry Andric /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
925fe013be4SDimitry Andric std::pair<BasicBlock *, Value *>
926fe013be4SDimitry Andric createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
927e8d8bef9SDimitry Andric
928e8d8bef9SDimitry Andric protected:
929e8d8bef9SDimitry Andric /// Emits an iteration count bypass check after the main vector loop has
930e8d8bef9SDimitry Andric /// finished to see if there are any iterations left to execute by either
931e8d8bef9SDimitry Andric /// the vector epilogue or the scalar epilogue.
93281ad6265SDimitry Andric BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
933e8d8bef9SDimitry Andric BasicBlock *Bypass,
934e8d8bef9SDimitry Andric BasicBlock *Insert);
935e8d8bef9SDimitry Andric void printDebugTracesAtStart() override;
936e8d8bef9SDimitry Andric void printDebugTracesAtEnd() override;
937e8d8bef9SDimitry Andric };
9380b57cec5SDimitry Andric } // end namespace llvm
9390b57cec5SDimitry Andric
9400b57cec5SDimitry Andric /// Look for a meaningful debug location on the instruction or its
9410b57cec5SDimitry Andric /// operands.
942c9157d92SDimitry Andric static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
9430b57cec5SDimitry Andric if (!I)
944c9157d92SDimitry Andric return DebugLoc();
9450b57cec5SDimitry Andric
9460b57cec5SDimitry Andric DebugLoc Empty;
9470b57cec5SDimitry Andric if (I->getDebugLoc() != Empty)
948c9157d92SDimitry Andric return I->getDebugLoc();
9490b57cec5SDimitry Andric
950fe6060f1SDimitry Andric for (Use &Op : I->operands()) {
951fe6060f1SDimitry Andric if (Instruction *OpInst = dyn_cast<Instruction>(Op))
9520b57cec5SDimitry Andric if (OpInst->getDebugLoc() != Empty)
953c9157d92SDimitry Andric return OpInst->getDebugLoc();
9540b57cec5SDimitry Andric }
9550b57cec5SDimitry Andric
956c9157d92SDimitry Andric return I->getDebugLoc();
9570b57cec5SDimitry Andric }
9580b57cec5SDimitry Andric
959fe6060f1SDimitry Andric /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
960fe6060f1SDimitry Andric /// is passed, the message relates to that particular instruction.
9618bcb0991SDimitry Andric #ifndef NDEBUG
962fe6060f1SDimitry Andric static void debugVectorizationMessage(const StringRef Prefix,
963fe6060f1SDimitry Andric const StringRef DebugMsg,
9648bcb0991SDimitry Andric Instruction *I) {
965fe6060f1SDimitry Andric dbgs() << "LV: " << Prefix << DebugMsg;
9668bcb0991SDimitry Andric if (I != nullptr)
9678bcb0991SDimitry Andric dbgs() << " " << *I;
9688bcb0991SDimitry Andric else
9698bcb0991SDimitry Andric dbgs() << '.';
9708bcb0991SDimitry Andric dbgs() << '\n';
9718bcb0991SDimitry Andric }
9728bcb0991SDimitry Andric #endif
9738bcb0991SDimitry Andric
9748bcb0991SDimitry Andric /// Create an analysis remark that explains why vectorization failed
9758bcb0991SDimitry Andric ///
9768bcb0991SDimitry Andric /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
9778bcb0991SDimitry Andric /// RemarkName is the identifier for the remark. If \p I is passed it is an
9788bcb0991SDimitry Andric /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
9798bcb0991SDimitry Andric /// the location of the remark. \return the remark object that can be
9808bcb0991SDimitry Andric /// streamed to.
9818bcb0991SDimitry Andric static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
9828bcb0991SDimitry Andric StringRef RemarkName, Loop *TheLoop, Instruction *I) {
9838bcb0991SDimitry Andric Value *CodeRegion = TheLoop->getHeader();
9848bcb0991SDimitry Andric DebugLoc DL = TheLoop->getStartLoc();
9858bcb0991SDimitry Andric
9868bcb0991SDimitry Andric if (I) {
9878bcb0991SDimitry Andric CodeRegion = I->getParent();
9888bcb0991SDimitry Andric // If there is no debug location attached to the instruction, fall back to
9898bcb0991SDimitry Andric // using the loop's.
9908bcb0991SDimitry Andric if (I->getDebugLoc())
9918bcb0991SDimitry Andric DL = I->getDebugLoc();
9928bcb0991SDimitry Andric }
9938bcb0991SDimitry Andric
994fe6060f1SDimitry Andric return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
9958bcb0991SDimitry Andric }
9968bcb0991SDimitry Andric
99704eeddc0SDimitry Andric namespace llvm {
99804eeddc0SDimitry Andric
999e8d8bef9SDimitry Andric /// Return a value for Step multiplied by VF.
100081ad6265SDimitry Andric Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1001349cc55cSDimitry Andric int64_t Step) {
1002349cc55cSDimitry Andric assert(Ty->isIntegerTy() && "Expected an integer step");
1003fe013be4SDimitry Andric return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
1004e8d8bef9SDimitry Andric }
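
// Illustration only: for a fixed VF of 4 and Step = 2 this folds to the
// constant 8; for a scalable VF of <vscale x 4> and Step = 2 it emits the
// equivalent of "mul (vscale * 4), 2", i.e. a runtime value of vscale * 8.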
1005e8d8bef9SDimitry Andric
1006fe6060f1SDimitry Andric /// Return the runtime value for VF.
100781ad6265SDimitry Andric Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1008fe013be4SDimitry Andric return B.CreateElementCount(Ty, VF);
1009fe6060f1SDimitry Andric }
1010fe6060f1SDimitry Andric
1011fe013be4SDimitry Andric const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
1012fe013be4SDimitry Andric Loop *OrigLoop) {
1013bdd1243dSDimitry Andric const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1014bdd1243dSDimitry Andric assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1015bdd1243dSDimitry Andric
1016bdd1243dSDimitry Andric ScalarEvolution &SE = *PSE.getSE();
1017fe013be4SDimitry Andric return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
1018bdd1243dSDimitry Andric }
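
// Note (illustrative): the trip count is the backedge-taken count plus one;
// e.g. a loop running i = 0 .. n-1 has a backedge-taken count of n-1 and a
// trip count of n, converted to IdxTy as needed.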
1019bdd1243dSDimitry Andric
10208bcb0991SDimitry Andric void reportVectorizationFailure(const StringRef DebugMsg,
10218bcb0991SDimitry Andric const StringRef OREMsg, const StringRef ORETag,
1022fe6060f1SDimitry Andric OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1023fe6060f1SDimitry Andric Instruction *I) {
1024fe6060f1SDimitry Andric LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
10258bcb0991SDimitry Andric LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1026fe6060f1SDimitry Andric ORE->emit(
1027fe6060f1SDimitry Andric createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1028fe6060f1SDimitry Andric << "loop not vectorized: " << OREMsg);
1029fe6060f1SDimitry Andric }
1030fe6060f1SDimitry Andric
1031fe6060f1SDimitry Andric void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1032fe6060f1SDimitry Andric OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1033fe6060f1SDimitry Andric Instruction *I) {
1034fe6060f1SDimitry Andric LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1035fe6060f1SDimitry Andric LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1036fe6060f1SDimitry Andric ORE->emit(
1037fe6060f1SDimitry Andric createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1038fe6060f1SDimitry Andric << Msg);
10398bcb0991SDimitry Andric }
10408bcb0991SDimitry Andric
1041c9157d92SDimitry Andric /// Report successful vectorization of the loop. In case an outer loop is
1042c9157d92SDimitry Andric /// vectorized, prepend "outer" to the vectorization remark.
1043c9157d92SDimitry Andric static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1044c9157d92SDimitry Andric VectorizationFactor VF, unsigned IC) {
1045c9157d92SDimitry Andric LLVM_DEBUG(debugVectorizationMessage(
1046c9157d92SDimitry Andric "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1047c9157d92SDimitry Andric nullptr));
1048c9157d92SDimitry Andric StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1049c9157d92SDimitry Andric ORE->emit([&]() {
1050c9157d92SDimitry Andric return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1051c9157d92SDimitry Andric TheLoop->getHeader())
1052c9157d92SDimitry Andric << "vectorized " << LoopType << "loop (vectorization width: "
1053c9157d92SDimitry Andric << ore::NV("VectorizationFactor", VF.Width)
1054c9157d92SDimitry Andric << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1055c9157d92SDimitry Andric });
1056c9157d92SDimitry Andric }
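
// With assumed values VF = 4 and IC = 2, the emitted remark reads roughly:
//   "vectorized loop (vectorization width: 4, interleaved count: 2)"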
1057c9157d92SDimitry Andric
10588bcb0991SDimitry Andric } // end namespace llvm
10598bcb0991SDimitry Andric
10600b57cec5SDimitry Andric #ifndef NDEBUG
10610b57cec5SDimitry Andric /// \return string containing a file name and a line # for the given loop.
10620b57cec5SDimitry Andric static std::string getDebugLocString(const Loop *L) {
10630b57cec5SDimitry Andric std::string Result;
10640b57cec5SDimitry Andric if (L) {
10650b57cec5SDimitry Andric raw_string_ostream OS(Result);
10660b57cec5SDimitry Andric if (const DebugLoc LoopDbgLoc = L->getStartLoc())
10670b57cec5SDimitry Andric LoopDbgLoc.print(OS);
10680b57cec5SDimitry Andric else
10690b57cec5SDimitry Andric // Just print the module name.
10700b57cec5SDimitry Andric OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
10710b57cec5SDimitry Andric OS.flush();
10720b57cec5SDimitry Andric }
10730b57cec5SDimitry Andric return Result;
10740b57cec5SDimitry Andric }
10750b57cec5SDimitry Andric #endif
10760b57cec5SDimitry Andric
10774824e7fdSDimitry Andric void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
10784824e7fdSDimitry Andric VPTransformState &State) {
10794824e7fdSDimitry Andric
10804824e7fdSDimitry Andric // Collect recipes in the backward slice of `Root` that may generate a poison
10814824e7fdSDimitry Andric // value that is used after vectorization.
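// A sketch of the situation this guards against (illustrative IR, not from a
// test case): an access that was conditional in the original loop, e.g.
//   %gep = getelementptr inbounds i32, ptr %a, i64 %i   ; only if %c
// may become an unconditional wide or masked access after vectorization, so
// poison-generating flags such as "inbounds" on the address computation must
// be dropped to keep the now always-computed address from being poison.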
10824824e7fdSDimitry Andric SmallPtrSet<VPRecipeBase *, 16> Visited;
10834824e7fdSDimitry Andric auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
10844824e7fdSDimitry Andric SmallVector<VPRecipeBase *, 16> Worklist;
10854824e7fdSDimitry Andric Worklist.push_back(Root);
10864824e7fdSDimitry Andric
10874824e7fdSDimitry Andric // Traverse the backward slice of Root through its use-def chain.
10884824e7fdSDimitry Andric while (!Worklist.empty()) {
10894824e7fdSDimitry Andric VPRecipeBase *CurRec = Worklist.back();
10904824e7fdSDimitry Andric Worklist.pop_back();
10914824e7fdSDimitry Andric
10924824e7fdSDimitry Andric if (!Visited.insert(CurRec).second)
10934824e7fdSDimitry Andric continue;
10944824e7fdSDimitry Andric
10954824e7fdSDimitry Andric // Prune search if we find another recipe generating a widen memory
10964824e7fdSDimitry Andric // instruction. Widen memory instructions involved in address computation
10974824e7fdSDimitry Andric // will lead to gather/scatter instructions, which don't need to be
10984824e7fdSDimitry Andric // handled.
10994824e7fdSDimitry Andric if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
110004eeddc0SDimitry Andric isa<VPInterleaveRecipe>(CurRec) ||
110181ad6265SDimitry Andric isa<VPScalarIVStepsRecipe>(CurRec) ||
1102753f127fSDimitry Andric isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1103753f127fSDimitry Andric isa<VPActiveLaneMaskPHIRecipe>(CurRec))
11044824e7fdSDimitry Andric continue;
11054824e7fdSDimitry Andric
11064824e7fdSDimitry Andric // This recipe contributes to the address computation of a widen
1107fe013be4SDimitry Andric // load/store. If the underlying instruction has poison-generating flags,
1108fe013be4SDimitry Andric // drop them directly.
1109fe013be4SDimitry Andric if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1110fe013be4SDimitry Andric RecWithFlags->dropPoisonGeneratingFlags();
1111fe013be4SDimitry Andric } else {
1112c9157d92SDimitry Andric Instruction *Instr = dyn_cast_or_null<Instruction>(
1113c9157d92SDimitry Andric CurRec->getVPSingleValue()->getUnderlyingValue());
1114fe013be4SDimitry Andric (void)Instr;
1115fe013be4SDimitry Andric assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1116fe013be4SDimitry Andric "found instruction with poison generating flags not covered by "
1117fe013be4SDimitry Andric "VPRecipeWithIRFlags");
1118fe013be4SDimitry Andric }
11194824e7fdSDimitry Andric
11204824e7fdSDimitry Andric // Add new definitions to the worklist.
11214824e7fdSDimitry Andric for (VPValue *operand : CurRec->operands())
1122bdd1243dSDimitry Andric if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1123bdd1243dSDimitry Andric Worklist.push_back(OpDef);
11244824e7fdSDimitry Andric }
11254824e7fdSDimitry Andric });
11264824e7fdSDimitry Andric
11274824e7fdSDimitry Andric // Traverse all the recipes in the VPlan and collect the poison-generating
11284824e7fdSDimitry Andric // recipes in the backward slice starting at the address of a VPWidenRecipe or
11294824e7fdSDimitry Andric // VPInterleaveRecipe.
1130bdd1243dSDimitry Andric auto Iter = vp_depth_first_deep(State.Plan->getEntry());
11314824e7fdSDimitry Andric for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
11324824e7fdSDimitry Andric for (VPRecipeBase &Recipe : *VPBB) {
11334824e7fdSDimitry Andric if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
113481ad6265SDimitry Andric Instruction &UnderlyingInstr = WidenRec->getIngredient();
1135bdd1243dSDimitry Andric VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
113681ad6265SDimitry Andric if (AddrDef && WidenRec->isConsecutive() &&
113781ad6265SDimitry Andric Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1138bdd1243dSDimitry Andric collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
11394824e7fdSDimitry Andric } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1140bdd1243dSDimitry Andric VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
11414824e7fdSDimitry Andric if (AddrDef) {
11424824e7fdSDimitry Andric // Check if any member of the interleave group needs predication.
11434824e7fdSDimitry Andric const InterleaveGroup<Instruction> *InterGroup =
11444824e7fdSDimitry Andric InterleaveRec->getInterleaveGroup();
11454824e7fdSDimitry Andric bool NeedPredication = false;
11464824e7fdSDimitry Andric for (int I = 0, NumMembers = InterGroup->getNumMembers();
11474824e7fdSDimitry Andric I < NumMembers; ++I) {
11484824e7fdSDimitry Andric Instruction *Member = InterGroup->getMember(I);
11494824e7fdSDimitry Andric if (Member)
11504824e7fdSDimitry Andric NeedPredication |=
11514824e7fdSDimitry Andric Legal->blockNeedsPredication(Member->getParent());
11524824e7fdSDimitry Andric }
11534824e7fdSDimitry Andric
11544824e7fdSDimitry Andric if (NeedPredication)
1155bdd1243dSDimitry Andric collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
11564824e7fdSDimitry Andric }
11574824e7fdSDimitry Andric }
11584824e7fdSDimitry Andric }
11594824e7fdSDimitry Andric }
11604824e7fdSDimitry Andric }
11614824e7fdSDimitry Andric
11620b57cec5SDimitry Andric namespace llvm {
11630b57cec5SDimitry Andric
11648bcb0991SDimitry Andric // Loop vectorization cost-model hints how the scalar epilogue loop should be
11658bcb0991SDimitry Andric // lowered.
11668bcb0991SDimitry Andric enum ScalarEpilogueLowering {
11678bcb0991SDimitry Andric
11688bcb0991SDimitry Andric // The default: allowing scalar epilogues.
11698bcb0991SDimitry Andric CM_ScalarEpilogueAllowed,
11708bcb0991SDimitry Andric
11718bcb0991SDimitry Andric // Vectorization with OptForSize: don't allow epilogues.
11728bcb0991SDimitry Andric CM_ScalarEpilogueNotAllowedOptSize,
11738bcb0991SDimitry Andric
11748bcb0991SDimitry Andric // A special case of vectorization with OptForSize: loops with a very small
11758bcb0991SDimitry Andric // trip count are considered for vectorization under OptForSize, thereby
11768bcb0991SDimitry Andric // making sure the cost of their loop body is dominant, free of runtime
11778bcb0991SDimitry Andric // guards and scalar iteration overheads.
11788bcb0991SDimitry Andric CM_ScalarEpilogueNotAllowedLowTripLoop,
11798bcb0991SDimitry Andric
11808bcb0991SDimitry Andric // Loop hint predicate indicating an epilogue is undesired.
1181e8d8bef9SDimitry Andric CM_ScalarEpilogueNotNeededUsePredicate,
1182e8d8bef9SDimitry Andric
1183e8d8bef9SDimitry Andric // Directive indicating we must either tail fold or not vectorize
1184e8d8bef9SDimitry Andric CM_ScalarEpilogueNotAllowedUsePredicate
11858bcb0991SDimitry Andric };
11868bcb0991SDimitry Andric
1187fe013be4SDimitry Andric using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1188fe6060f1SDimitry Andric
11890b57cec5SDimitry Andric /// LoopVectorizationCostModel - estimates the expected speedups due to
11900b57cec5SDimitry Andric /// vectorization.
11910b57cec5SDimitry Andric /// In many cases vectorization is not profitable. This can happen because of
11920b57cec5SDimitry Andric /// a number of reasons. In this class we mainly attempt to predict the
11930b57cec5SDimitry Andric /// expected speedup/slowdowns due to the supported instruction set. We use the
11940b57cec5SDimitry Andric /// TargetTransformInfo to query the different backends for the cost of
11950b57cec5SDimitry Andric /// different operations.
11960b57cec5SDimitry Andric class LoopVectorizationCostModel {
11970b57cec5SDimitry Andric public:
11988bcb0991SDimitry Andric LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
11998bcb0991SDimitry Andric PredicatedScalarEvolution &PSE, LoopInfo *LI,
12008bcb0991SDimitry Andric LoopVectorizationLegality *Legal,
12010b57cec5SDimitry Andric const TargetTransformInfo &TTI,
12020b57cec5SDimitry Andric const TargetLibraryInfo *TLI, DemandedBits *DB,
12030b57cec5SDimitry Andric AssumptionCache *AC,
12040b57cec5SDimitry Andric OptimizationRemarkEmitter *ORE, const Function *F,
12050b57cec5SDimitry Andric const LoopVectorizeHints *Hints,
12060b57cec5SDimitry Andric InterleavedAccessInfo &IAI)
12078bcb0991SDimitry Andric : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
12088bcb0991SDimitry Andric TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
12098bcb0991SDimitry Andric Hints(Hints), InterleaveInfo(IAI) {}
12100b57cec5SDimitry Andric
1211fe6060f1SDimitry Andric /// \return An upper bound for the vectorization factors (both fixed and
1212fe6060f1SDimitry Andric /// scalable). If the factors are 0, vectorization and interleaving should be
1213fe6060f1SDimitry Andric /// avoided up front.
1214fe6060f1SDimitry Andric FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
12158bcb0991SDimitry Andric
12168bcb0991SDimitry Andric /// \return True if runtime checks are required for vectorization, and false
12178bcb0991SDimitry Andric /// otherwise.
12188bcb0991SDimitry Andric bool runtimeChecksRequired();
12190b57cec5SDimitry Andric
12200b57cec5SDimitry Andric /// Setup cost-based decisions for user vectorization factor.
1221fe6060f1SDimitry Andric /// \return true if the UserVF is a feasible VF to be chosen.
1222fe6060f1SDimitry Andric bool selectUserVectorizationFactor(ElementCount UserVF) {
12230b57cec5SDimitry Andric collectUniformsAndScalars(UserVF);
12240b57cec5SDimitry Andric collectInstsToScalarize(UserVF);
1225fe6060f1SDimitry Andric return expectedCost(UserVF).first.isValid();
12260b57cec5SDimitry Andric }
12270b57cec5SDimitry Andric
12280b57cec5SDimitry Andric /// \return The size (in bits) of the smallest and widest types in the code
12290b57cec5SDimitry Andric /// that needs to be vectorized. We ignore values that remain scalar such as
12300b57cec5SDimitry Andric /// 64 bit loop indices.
12310b57cec5SDimitry Andric std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
12320b57cec5SDimitry Andric
12330b57cec5SDimitry Andric /// \return The desired interleave count.
12340b57cec5SDimitry Andric /// If interleave count has been specified by metadata it will be returned.
12350b57cec5SDimitry Andric /// Otherwise, the interleave count is computed and returned. VF and LoopCost
12360b57cec5SDimitry Andric /// are the selected vectorization factor and the cost of the selected VF.
1237bdd1243dSDimitry Andric unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
12380b57cec5SDimitry Andric
12390b57cec5SDimitry Andric /// Memory access instruction may be vectorized in more than one way.
12400b57cec5SDimitry Andric /// Form of instruction after vectorization depends on cost.
12410b57cec5SDimitry Andric /// This function takes cost-based decisions for Load/Store instructions
12420b57cec5SDimitry Andric /// and collects them in a map. This decisions map is used for building
12430b57cec5SDimitry Andric /// the lists of loop-uniform and loop-scalar instructions.
12440b57cec5SDimitry Andric /// The calculated cost is saved with widening decision in order to
12450b57cec5SDimitry Andric /// avoid redundant calculations.
1246e8d8bef9SDimitry Andric void setCostBasedWideningDecision(ElementCount VF);
12470b57cec5SDimitry Andric
1248c9157d92SDimitry Andric /// A call may be vectorized in different ways depending on whether we have
1249c9157d92SDimitry Andric /// vectorized variants available and whether the target supports masking.
1250c9157d92SDimitry Andric /// This function analyzes all calls in the function at the supplied VF,
1251c9157d92SDimitry Andric /// makes a decision based on the costs of available options, and stores that
1252c9157d92SDimitry Andric /// decision in a map for use in planning and plan execution.
1253c9157d92SDimitry Andric void setVectorizedCallDecision(ElementCount VF);
1254c9157d92SDimitry Andric
12550b57cec5SDimitry Andric /// A struct that represents some properties of the register usage
12560b57cec5SDimitry Andric /// of a loop.
12570b57cec5SDimitry Andric struct RegisterUsage {
12580b57cec5SDimitry Andric /// Holds the number of loop invariant values that are used in the loop.
12598bcb0991SDimitry Andric /// The key is ClassID of target-provided register class.
12608bcb0991SDimitry Andric SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
12610b57cec5SDimitry Andric /// Holds the maximum number of concurrent live intervals in the loop.
12628bcb0991SDimitry Andric /// The key is ClassID of target-provided register class.
12638bcb0991SDimitry Andric SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
12640b57cec5SDimitry Andric };
12650b57cec5SDimitry Andric
12660b57cec5SDimitry Andric /// \return Returns information about the register usages of the loop for the
12670b57cec5SDimitry Andric /// given vectorization factors.
1268e8d8bef9SDimitry Andric SmallVector<RegisterUsage, 8>
1269e8d8bef9SDimitry Andric calculateRegisterUsage(ArrayRef<ElementCount> VFs);
12700b57cec5SDimitry Andric
12710b57cec5SDimitry Andric /// Collect values we want to ignore in the cost model.
12720b57cec5SDimitry Andric void collectValuesToIgnore();
12730b57cec5SDimitry Andric
1274fe6060f1SDimitry Andric /// Collect all element types in the loop for which widening is needed.
1275fe6060f1SDimitry Andric void collectElementTypesForWidening();
1276fe6060f1SDimitry Andric
1277e8d8bef9SDimitry Andric /// Split reductions into those that happen in the loop, and those that happen
1278c9157d92SDimitry Andric /// outside. In-loop reductions are collected into InLoopReductions.
1279e8d8bef9SDimitry Andric void collectInLoopReductions();
1280e8d8bef9SDimitry Andric
1281fe6060f1SDimitry Andric /// Returns true if we should use strict in-order reductions for the given
1282fe6060f1SDimitry Andric /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and we do
1283fe6060f1SDimitry Andric /// not allow reordering of FP operations.
128581ad6265SDimitry Andric bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1286349cc55cSDimitry Andric return !Hints->allowReordering() && RdxDesc.isOrdered();
1287fe6060f1SDimitry Andric }
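
// Rough example (assumed IR shape): an in-order FP sum that may not be
// reassociated is kept as a strict running reduction inside the vector loop,
// e.g. roughly
//   %sum.next = call float @llvm.vector.reduce.fadd(float %sum, <4 x float> %v)
// rather than accumulating into a vector and reducing once after the loop.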
1288fe6060f1SDimitry Andric
12890b57cec5SDimitry Andric /// \returns The smallest bitwidth each instruction can be represented with.
12900b57cec5SDimitry Andric /// The vector equivalents of these instructions should be truncated to this
12910b57cec5SDimitry Andric /// type.
12920b57cec5SDimitry Andric const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
12930b57cec5SDimitry Andric return MinBWs;
12940b57cec5SDimitry Andric }
12950b57cec5SDimitry Andric
12960b57cec5SDimitry Andric /// \returns True if it is more profitable to scalarize instruction \p I for
12970b57cec5SDimitry Andric /// vectorization factor \p VF.
1298e8d8bef9SDimitry Andric bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1299e8d8bef9SDimitry Andric assert(VF.isVector() &&
1300e8d8bef9SDimitry Andric "Profitable to scalarize relevant only for VF > 1.");
13010b57cec5SDimitry Andric
13020b57cec5SDimitry Andric // Cost model is not run in the VPlan-native path - return conservative
13030b57cec5SDimitry Andric // result until this changes.
13040b57cec5SDimitry Andric if (EnableVPlanNativePath)
13050b57cec5SDimitry Andric return false;
13060b57cec5SDimitry Andric
13070b57cec5SDimitry Andric auto Scalars = InstsToScalarize.find(VF);
13080b57cec5SDimitry Andric assert(Scalars != InstsToScalarize.end() &&
13090b57cec5SDimitry Andric "VF not yet analyzed for scalarization profitability");
1310fe013be4SDimitry Andric return Scalars->second.contains(I);
13110b57cec5SDimitry Andric }
13120b57cec5SDimitry Andric
13130b57cec5SDimitry Andric /// Returns true if \p I is known to be uniform after vectorization.
1314e8d8bef9SDimitry Andric bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1315fe013be4SDimitry Andric // Pseudo probe needs to be duplicated for each unrolled iteration and
1316fe013be4SDimitry Andric // vector lane so that profiled loop trip count can be accurately
1317fe013be4SDimitry Andric // accumulated instead of being under counted.
1318fe013be4SDimitry Andric if (isa<PseudoProbeInst>(I))
1319fe013be4SDimitry Andric return false;
1320fe013be4SDimitry Andric
1321e8d8bef9SDimitry Andric if (VF.isScalar())
13220b57cec5SDimitry Andric return true;
13230b57cec5SDimitry Andric
13240b57cec5SDimitry Andric // Cost model is not run in the VPlan-native path - return conservative
13250b57cec5SDimitry Andric // result until this changes.
13260b57cec5SDimitry Andric if (EnableVPlanNativePath)
13270b57cec5SDimitry Andric return false;
13280b57cec5SDimitry Andric
13290b57cec5SDimitry Andric auto UniformsPerVF = Uniforms.find(VF);
13300b57cec5SDimitry Andric assert(UniformsPerVF != Uniforms.end() &&
13310b57cec5SDimitry Andric "VF not yet analyzed for uniformity");
13325ffd83dbSDimitry Andric return UniformsPerVF->second.count(I);
13330b57cec5SDimitry Andric }
13340b57cec5SDimitry Andric
13350b57cec5SDimitry Andric /// Returns true if \p I is known to be scalar after vectorization.
1336e8d8bef9SDimitry Andric bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337e8d8bef9SDimitry Andric if (VF.isScalar())
13380b57cec5SDimitry Andric return true;
13390b57cec5SDimitry Andric
13400b57cec5SDimitry Andric // Cost model is not run in the VPlan-native path - return conservative
13410b57cec5SDimitry Andric // result until this changes.
13420b57cec5SDimitry Andric if (EnableVPlanNativePath)
13430b57cec5SDimitry Andric return false;
13440b57cec5SDimitry Andric
13450b57cec5SDimitry Andric auto ScalarsPerVF = Scalars.find(VF);
13460b57cec5SDimitry Andric assert(ScalarsPerVF != Scalars.end() &&
13470b57cec5SDimitry Andric "Scalar values are not calculated for VF");
13485ffd83dbSDimitry Andric return ScalarsPerVF->second.count(I);
13490b57cec5SDimitry Andric }
13500b57cec5SDimitry Andric
13510b57cec5SDimitry Andric /// \returns True if instruction \p I can be truncated to a smaller bitwidth
13520b57cec5SDimitry Andric /// for vectorization factor \p VF.
1353e8d8bef9SDimitry Andric bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354fe013be4SDimitry Andric return VF.isVector() && MinBWs.contains(I) &&
13550b57cec5SDimitry Andric !isProfitableToScalarize(I, VF) &&
13560b57cec5SDimitry Andric !isScalarAfterVectorization(I, VF);
13570b57cec5SDimitry Andric }
13580b57cec5SDimitry Andric
13590b57cec5SDimitry Andric /// Decision that was taken during cost calculation for memory instruction.
13600b57cec5SDimitry Andric enum InstWidening {
13610b57cec5SDimitry Andric CM_Unknown,
13620b57cec5SDimitry Andric CM_Widen, // For consecutive accesses with stride +1.
13630b57cec5SDimitry Andric CM_Widen_Reverse, // For consecutive accesses with stride -1.
13640b57cec5SDimitry Andric CM_Interleave,
13650b57cec5SDimitry Andric CM_GatherScatter,
1366c9157d92SDimitry Andric CM_Scalarize,
1367c9157d92SDimitry Andric CM_VectorCall,
1368c9157d92SDimitry Andric CM_IntrinsicCall
13690b57cec5SDimitry Andric };
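
// Loosely, for a load these decisions correspond to the following generated
// code shapes (illustrative; the details depend on the target and on masking):
//   CM_Widen / CM_Widen_Reverse -> one wide (possibly reversed) vector load
//   CM_Interleave               -> wide loads plus shuffles for the whole group
//   CM_GatherScatter            -> a gather through a vector of pointers
//   CM_Scalarize                -> VF scalar loads, one per lane
//   CM_VectorCall / CM_IntrinsicCall apply to call instructions instead.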
13700b57cec5SDimitry Andric
13710b57cec5SDimitry Andric /// Save vectorization decision \p W and \p Cost taken by the cost model for
13720b57cec5SDimitry Andric /// instruction \p I and vector width \p VF.
1373e8d8bef9SDimitry Andric void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1374e8d8bef9SDimitry Andric InstructionCost Cost) {
1375e8d8bef9SDimitry Andric assert(VF.isVector() && "Expected VF >=2");
13760b57cec5SDimitry Andric WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
13770b57cec5SDimitry Andric }
13780b57cec5SDimitry Andric
13790b57cec5SDimitry Andric /// Save vectorization decision \p W and \p Cost taken by the cost model for
13800b57cec5SDimitry Andric /// interleaving group \p Grp and vector width \p VF.
1381e8d8bef9SDimitry Andric void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1382e8d8bef9SDimitry Andric ElementCount VF, InstWidening W,
1383e8d8bef9SDimitry Andric InstructionCost Cost) {
1384e8d8bef9SDimitry Andric assert(VF.isVector() && "Expected VF >=2");
13850b57cec5SDimitry Andric /// Broadcast this decision to all instructions inside the group.
13860b57cec5SDimitry Andric /// But the cost will be assigned to one instruction only.
13870b57cec5SDimitry Andric for (unsigned i = 0; i < Grp->getFactor(); ++i) {
13880b57cec5SDimitry Andric if (auto *I = Grp->getMember(i)) {
13890b57cec5SDimitry Andric if (Grp->getInsertPos() == I)
13900b57cec5SDimitry Andric WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
13910b57cec5SDimitry Andric else
13920b57cec5SDimitry Andric WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
13930b57cec5SDimitry Andric }
13940b57cec5SDimitry Andric }
13950b57cec5SDimitry Andric }
13960b57cec5SDimitry Andric
13970b57cec5SDimitry Andric /// Return the cost model decision for the given instruction \p I and vector
13980b57cec5SDimitry Andric /// width \p VF. Return CM_Unknown if this instruction did not pass
13990b57cec5SDimitry Andric /// through the cost modeling.
1400fe6060f1SDimitry Andric InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1401e8d8bef9SDimitry Andric assert(VF.isVector() && "Expected VF to be a vector VF");
14020b57cec5SDimitry Andric // Cost model is not run in the VPlan-native path - return conservative
14030b57cec5SDimitry Andric // result until this changes.
14040b57cec5SDimitry Andric if (EnableVPlanNativePath)
14050b57cec5SDimitry Andric return CM_GatherScatter;
14060b57cec5SDimitry Andric
1407e8d8bef9SDimitry Andric std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
14080b57cec5SDimitry Andric auto Itr = WideningDecisions.find(InstOnVF);
14090b57cec5SDimitry Andric if (Itr == WideningDecisions.end())
14100b57cec5SDimitry Andric return CM_Unknown;
14110b57cec5SDimitry Andric return Itr->second.first;
14120b57cec5SDimitry Andric }
14130b57cec5SDimitry Andric
14140b57cec5SDimitry Andric /// Return the vectorization cost for the given instruction \p I and vector
14150b57cec5SDimitry Andric /// width \p VF.
1416e8d8bef9SDimitry Andric InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1417e8d8bef9SDimitry Andric assert(VF.isVector() && "Expected VF >=2");
1418e8d8bef9SDimitry Andric std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419fe013be4SDimitry Andric assert(WideningDecisions.contains(InstOnVF) &&
14200b57cec5SDimitry Andric "The cost is not calculated");
14210b57cec5SDimitry Andric return WideningDecisions[InstOnVF].second;
14220b57cec5SDimitry Andric }
14230b57cec5SDimitry Andric
1424c9157d92SDimitry Andric struct CallWideningDecision {
1425c9157d92SDimitry Andric InstWidening Kind;
1426c9157d92SDimitry Andric Function *Variant;
1427c9157d92SDimitry Andric Intrinsic::ID IID;
1428c9157d92SDimitry Andric std::optional<unsigned> MaskPos;
1429c9157d92SDimitry Andric InstructionCost Cost;
1430c9157d92SDimitry Andric };
1431c9157d92SDimitry Andric
1432c9157d92SDimitry Andric void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1433c9157d92SDimitry Andric Function *Variant, Intrinsic::ID IID,
1434c9157d92SDimitry Andric std::optional<unsigned> MaskPos,
1435c9157d92SDimitry Andric InstructionCost Cost) {
1436c9157d92SDimitry Andric assert(!VF.isScalar() && "Expected vector VF");
1437c9157d92SDimitry Andric CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1438c9157d92SDimitry Andric MaskPos, Cost};
1439c9157d92SDimitry Andric }
1440c9157d92SDimitry Andric
1441c9157d92SDimitry Andric CallWideningDecision getCallWideningDecision(CallInst *CI,
1442c9157d92SDimitry Andric ElementCount VF) const {
1443c9157d92SDimitry Andric assert(!VF.isScalar() && "Expected vector VF");
1444c9157d92SDimitry Andric return CallWideningDecisions.at(std::make_pair(CI, VF));
1445c9157d92SDimitry Andric }
1446c9157d92SDimitry Andric
14470b57cec5SDimitry Andric /// Return True if instruction \p I is an optimizable truncate whose operand
14480b57cec5SDimitry Andric /// is an induction variable. Such a truncate will be removed by adding a new
14490b57cec5SDimitry Andric /// induction variable with the destination type.
1450e8d8bef9SDimitry Andric bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
14510b57cec5SDimitry Andric // If the instruction is not a truncate, return false.
14520b57cec5SDimitry Andric auto *Trunc = dyn_cast<TruncInst>(I);
14530b57cec5SDimitry Andric if (!Trunc)
14540b57cec5SDimitry Andric return false;
14550b57cec5SDimitry Andric
14560b57cec5SDimitry Andric // Get the source and destination types of the truncate.
14570b57cec5SDimitry Andric Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
14580b57cec5SDimitry Andric Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
14590b57cec5SDimitry Andric
14600b57cec5SDimitry Andric // If the truncate is free for the given types, return false. Replacing a
14610b57cec5SDimitry Andric // free truncate with an induction variable would add an induction variable
14620b57cec5SDimitry Andric // update instruction to each iteration of the loop. We exclude from this
14630b57cec5SDimitry Andric // check the primary induction variable since it will need an update
14640b57cec5SDimitry Andric // instruction regardless.
14650b57cec5SDimitry Andric Value *Op = Trunc->getOperand(0);
14660b57cec5SDimitry Andric if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
14670b57cec5SDimitry Andric return false;
14680b57cec5SDimitry Andric
14690b57cec5SDimitry Andric // If the truncated value is not an induction variable, return false.
14700b57cec5SDimitry Andric return Legal->isInductionPhi(Op);
14710b57cec5SDimitry Andric }
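
// Illustrative example (assumed IR): for
//   %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
//   %t  = trunc i64 %iv to i32
// the truncate can be removed by introducing a parallel i32 induction
// variable, provided %iv is the primary induction variable or the i64 -> i32
// truncate is not free on the target.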
14720b57cec5SDimitry Andric
14730b57cec5SDimitry Andric /// Collects the instructions to scalarize for each predicated instruction in
14740b57cec5SDimitry Andric /// the loop.
1475e8d8bef9SDimitry Andric void collectInstsToScalarize(ElementCount VF);
14760b57cec5SDimitry Andric
14770b57cec5SDimitry Andric /// Collect Uniform and Scalar values for the given \p VF.
14780b57cec5SDimitry Andric /// The sets depend on CM decision for Load/Store instructions
14790b57cec5SDimitry Andric /// that may be vectorized as interleave, gather-scatter or scalarized.
1480c9157d92SDimitry Andric /// Also make a decision on what to do about call instructions in the loop
1481c9157d92SDimitry Andric /// at that VF -- scalarize, call a known vector routine, or call a
1482c9157d92SDimitry Andric /// vector intrinsic.
1483e8d8bef9SDimitry Andric void collectUniformsAndScalars(ElementCount VF) {
14840b57cec5SDimitry Andric // Do the analysis once.
1485fe013be4SDimitry Andric if (VF.isScalar() || Uniforms.contains(VF))
14860b57cec5SDimitry Andric return;
14870b57cec5SDimitry Andric setCostBasedWideningDecision(VF);
1488c9157d92SDimitry Andric setVectorizedCallDecision(VF);
14890b57cec5SDimitry Andric collectLoopUniforms(VF);
14900b57cec5SDimitry Andric collectLoopScalars(VF);
14910b57cec5SDimitry Andric }
14920b57cec5SDimitry Andric
14930b57cec5SDimitry Andric /// Returns true if the target machine supports masked store operation
14940b57cec5SDimitry Andric /// for the given \p DataType and kind of access to \p Ptr.
1495fe6060f1SDimitry Andric bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496349cc55cSDimitry Andric return Legal->isConsecutivePtr(DataType, Ptr) &&
14978bcb0991SDimitry Andric TTI.isLegalMaskedStore(DataType, Alignment);
14980b57cec5SDimitry Andric }
14990b57cec5SDimitry Andric
15000b57cec5SDimitry Andric /// Returns true if the target machine supports masked load operation
15010b57cec5SDimitry Andric /// for the given \p DataType and kind of access to \p Ptr.
1502fe6060f1SDimitry Andric bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1503349cc55cSDimitry Andric return Legal->isConsecutivePtr(DataType, Ptr) &&
15048bcb0991SDimitry Andric TTI.isLegalMaskedLoad(DataType, Alignment);
15050b57cec5SDimitry Andric }
15060b57cec5SDimitry Andric
15070b57cec5SDimitry Andric /// Returns true if the target machine can represent \p V as a masked gather
15080b57cec5SDimitry Andric /// or scatter operation.
1509fe013be4SDimitry Andric bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
15100b57cec5SDimitry Andric bool LI = isa<LoadInst>(V);
15110b57cec5SDimitry Andric bool SI = isa<StoreInst>(V);
15120b57cec5SDimitry Andric if (!LI && !SI)
15130b57cec5SDimitry Andric return false;
1514fe6060f1SDimitry Andric auto *Ty = getLoadStoreType(V);
15155ffd83dbSDimitry Andric Align Align = getLoadStoreAlignment(V);
151604eeddc0SDimitry Andric if (VF.isVector())
151704eeddc0SDimitry Andric Ty = VectorType::get(Ty, VF);
1518fe6060f1SDimitry Andric return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519fe6060f1SDimitry Andric (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520fe6060f1SDimitry Andric }
1521fe6060f1SDimitry Andric
1522fe6060f1SDimitry Andric /// Returns true if the target machine supports all of the reduction
1523fe6060f1SDimitry Andric /// variables found for the given VF.
1524fe6060f1SDimitry Andric bool canVectorizeReductions(ElementCount VF) const {
1525fe6060f1SDimitry Andric return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526fe6060f1SDimitry Andric const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527fe6060f1SDimitry Andric return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528fe6060f1SDimitry Andric }));
15290b57cec5SDimitry Andric }
15300b57cec5SDimitry Andric
1531bdd1243dSDimitry Andric /// Given costs for both strategies, return true if the scalar predication
1532bdd1243dSDimitry Andric /// lowering should be used for div/rem. This incorporates an override
1533bdd1243dSDimitry Andric /// option so it is not simply a cost comparison.
1534bdd1243dSDimitry Andric bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1535bdd1243dSDimitry Andric InstructionCost SafeDivisorCost) const {
1536bdd1243dSDimitry Andric switch (ForceSafeDivisor) {
1537bdd1243dSDimitry Andric case cl::BOU_UNSET:
1538bdd1243dSDimitry Andric return ScalarCost < SafeDivisorCost;
1539bdd1243dSDimitry Andric case cl::BOU_TRUE:
1540bdd1243dSDimitry Andric return false;
1541bdd1243dSDimitry Andric case cl::BOU_FALSE:
1542bdd1243dSDimitry Andric return true;
1543bdd1243dSDimitry Andric };
1544bdd1243dSDimitry Andric llvm_unreachable("impossible case value");
1545bdd1243dSDimitry Andric }
1546bdd1243dSDimitry Andric
1547bdd1243dSDimitry Andric /// Returns true if \p I is an instruction which requires predication and
1548bdd1243dSDimitry Andric /// for which our chosen predication strategy is scalarization (i.e. we
1549bdd1243dSDimitry Andric /// don't have an alternate strategy such as masking available).
1550bdd1243dSDimitry Andric /// \p VF is the vectorization factor that will be used to vectorize \p I.
155104eeddc0SDimitry Andric bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
15520b57cec5SDimitry Andric
1553bdd1243dSDimitry Andric /// Returns true if \p I is an instruction that needs to be predicated
1554bdd1243dSDimitry Andric /// at runtime. The result is independent of the predication mechanism.
1555bdd1243dSDimitry Andric /// Superset of instructions that return true for isScalarWithPredication.
1556bdd1243dSDimitry Andric bool isPredicatedInst(Instruction *I) const;
1557bdd1243dSDimitry Andric
1558bdd1243dSDimitry Andric /// Return the costs for our two available strategies for lowering a
1559bdd1243dSDimitry Andric /// div/rem operation which requires speculating at least one lane.
1560bdd1243dSDimitry Andric /// First result is for scalarization (will be invalid for scalable
1561bdd1243dSDimitry Andric /// vectors); second is for the safe-divisor strategy.
1562bdd1243dSDimitry Andric std::pair<InstructionCost, InstructionCost>
1563bdd1243dSDimitry Andric getDivRemSpeculationCost(Instruction *I,
1564bdd1243dSDimitry Andric ElementCount VF) const;
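
// For context, a sketch of the two strategies (not the exact transform): the
// safe-divisor approach replaces the divisor with a select on the lane mask,
// e.g. roughly
//   %d.safe = select <4 x i1> %mask, <4 x i32> %d, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %q      = udiv <4 x i32> %n, %d.safe
// so the division runs unconditionally without faulting on masked-off lanes,
// whereas scalarization instead branches around a scalar div/rem per lane.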
15650b57cec5SDimitry Andric
15660b57cec5SDimitry Andric /// Returns true if \p I is a memory instruction with consecutive memory
15670b57cec5SDimitry Andric /// access that can be widened.
1568bdd1243dSDimitry Andric bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
15690b57cec5SDimitry Andric
15700b57cec5SDimitry Andric /// Returns true if \p I is a memory instruction in an interleaved-group
15710b57cec5SDimitry Andric /// of memory accesses that can be vectorized with wide vector loads/stores
15720b57cec5SDimitry Andric /// and shuffles.
1573bdd1243dSDimitry Andric bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
15740b57cec5SDimitry Andric
15750b57cec5SDimitry Andric /// Check if \p Instr belongs to any interleaved access group.
15760b57cec5SDimitry Andric bool isAccessInterleaved(Instruction *Instr) {
15770b57cec5SDimitry Andric return InterleaveInfo.isInterleaved(Instr);
15780b57cec5SDimitry Andric }
15790b57cec5SDimitry Andric
15800b57cec5SDimitry Andric /// Get the interleaved access group that \p Instr belongs to.
15810b57cec5SDimitry Andric const InterleaveGroup<Instruction> *
15820b57cec5SDimitry Andric getInterleavedAccessGroup(Instruction *Instr) {
15830b57cec5SDimitry Andric return InterleaveInfo.getInterleaveGroup(Instr);
15840b57cec5SDimitry Andric }
15850b57cec5SDimitry Andric
1586e8d8bef9SDimitry Andric /// Returns true if we're required to use a scalar epilogue for at least
1587e8d8bef9SDimitry Andric /// the final iteration of the original loop.
1588fe013be4SDimitry Andric bool requiresScalarEpilogue(bool IsVectorizing) const {
1589e8d8bef9SDimitry Andric if (!isScalarEpilogueAllowed())
1590e8d8bef9SDimitry Andric return false;
1591e8d8bef9SDimitry Andric // If we might exit from anywhere but the latch, must run the exiting
1592e8d8bef9SDimitry Andric // iteration in scalar form.
1593e8d8bef9SDimitry Andric if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1594e8d8bef9SDimitry Andric return true;
1595fe013be4SDimitry Andric return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1596fe013be4SDimitry Andric }
1597fe013be4SDimitry Andric
1598fe013be4SDimitry Andric /// Returns true if we're required to use a scalar epilogue for at least
1599fe013be4SDimitry Andric /// the final iteration of the original loop for all VFs in \p Range.
1600fe013be4SDimitry Andric /// A scalar epilogue must either be required for all VFs in \p Range or for
1601fe013be4SDimitry Andric /// none.
1602fe013be4SDimitry Andric bool requiresScalarEpilogue(VFRange Range) const {
1603fe013be4SDimitry Andric auto RequiresScalarEpilogue = [this](ElementCount VF) {
1604fe013be4SDimitry Andric return requiresScalarEpilogue(VF.isVector());
1605fe013be4SDimitry Andric };
1606fe013be4SDimitry Andric bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1607fe013be4SDimitry Andric assert(
1608fe013be4SDimitry Andric (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1609fe013be4SDimitry Andric "all VFs in range must agree on whether a scalar epilogue is required");
1610fe013be4SDimitry Andric return IsRequired;
16110b57cec5SDimitry Andric }
16120b57cec5SDimitry Andric
16138bcb0991SDimitry Andric /// Returns true if a scalar epilogue is not allowed due to optsize or a
16148bcb0991SDimitry Andric /// loop hint annotation.
16158bcb0991SDimitry Andric bool isScalarEpilogueAllowed() const {
16168bcb0991SDimitry Andric return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
16178bcb0991SDimitry Andric }
16180b57cec5SDimitry Andric
1619fe013be4SDimitry Andric /// Returns the TailFoldingStyle that is best for the current loop.
1620fe013be4SDimitry Andric TailFoldingStyle
1621fe013be4SDimitry Andric getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1622fe013be4SDimitry Andric if (!CanFoldTailByMasking)
1623fe013be4SDimitry Andric return TailFoldingStyle::None;
16240b57cec5SDimitry Andric
1625fe013be4SDimitry Andric if (ForceTailFoldingStyle.getNumOccurrences())
1626fe013be4SDimitry Andric return ForceTailFoldingStyle;
1627fe013be4SDimitry Andric
1628fe013be4SDimitry Andric return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1629fe013be4SDimitry Andric }
1630fe013be4SDimitry Andric
1631fe013be4SDimitry Andric   /// Returns true if all loop blocks should be masked to fold the tail of the loop.
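/// For example (illustrative): with a trip count of 10 and VF = 4, the third
/// vector iteration would cover elements 8..11, so the lanes for elements 10
/// and 11 are masked off instead of running a scalar remainder loop.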
1632fe013be4SDimitry Andric   bool foldTailByMasking() const {
1633fe013be4SDimitry Andric return getTailFoldingStyle() != TailFoldingStyle::None;
1634753f127fSDimitry Andric }
1635753f127fSDimitry Andric
1636349cc55cSDimitry Andric   /// Returns true if the instructions in this block require predication
1637349cc55cSDimitry Andric /// for any reason, e.g. because tail folding now requires a predicate
1638349cc55cSDimitry Andric /// or because the block in the original loop was predicated.
1639349cc55cSDimitry Andric   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
16400b57cec5SDimitry Andric return foldTailByMasking() || Legal->blockNeedsPredication(BB);
16410b57cec5SDimitry Andric }
16420b57cec5SDimitry Andric
1643e8d8bef9SDimitry Andric /// Returns true if the Phi is part of an inloop reduction.
1644e8d8bef9SDimitry Andric   bool isInLoopReduction(PHINode *Phi) const {
1645c9157d92SDimitry Andric return InLoopReductions.contains(Phi);
1646e8d8bef9SDimitry Andric }
1647e8d8bef9SDimitry Andric
16480b57cec5SDimitry Andric /// Estimate cost of an intrinsic call instruction CI if it were vectorized
16490b57cec5SDimitry Andric /// with factor VF. Return the cost of the instruction, including
16500b57cec5SDimitry Andric /// scalarization overhead if it's needed.
1651fe6060f1SDimitry Andric InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
16520b57cec5SDimitry Andric
16530b57cec5SDimitry Andric /// Estimate cost of a call instruction CI if it were vectorized with factor
16540b57cec5SDimitry Andric /// VF. Return the cost of the instruction, including scalarization overhead
1655c9157d92SDimitry Andric /// if it's needed.
1656c9157d92SDimitry Andric InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
16570b57cec5SDimitry Andric
16585ffd83dbSDimitry Andric /// Invalidates decisions already taken by the cost model.
16595ffd83dbSDimitry Andric   void invalidateCostModelingDecisions() {
16605ffd83dbSDimitry Andric WideningDecisions.clear();
1661c9157d92SDimitry Andric CallWideningDecisions.clear();
16625ffd83dbSDimitry Andric Uniforms.clear();
16635ffd83dbSDimitry Andric Scalars.clear();
16645ffd83dbSDimitry Andric }
16655ffd83dbSDimitry Andric
1666fe013be4SDimitry Andric /// The vectorization cost is a combination of the cost itself and a boolean
1667fe013be4SDimitry Andric /// indicating whether any of the contributing operations will actually
1668fe013be4SDimitry Andric /// operate on vector values after type legalization in the backend. If this
1669fe013be4SDimitry Andric /// latter value is false, then all operations will be scalarized (i.e. no
1670fe013be4SDimitry Andric /// vectorization has actually taken place).
1671fe013be4SDimitry Andric using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672fe013be4SDimitry Andric
1673fe013be4SDimitry Andric /// Returns the expected execution cost. The unit of the cost does
1674fe013be4SDimitry Andric /// not matter because we use the 'cost' units to compare different
1675fe013be4SDimitry Andric /// vector widths. The cost that is returned is *not* normalized by
1676fe013be4SDimitry Andric /// the factor width. If \p Invalid is not nullptr, this function
1677fe013be4SDimitry Andric /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678fe013be4SDimitry Andric /// each instruction that has an Invalid cost for the given VF.
1679fe013be4SDimitry Andric VectorizationCostTy
1680fe013be4SDimitry Andric expectedCost(ElementCount VF,
1681fe013be4SDimitry Andric SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1682fe013be4SDimitry Andric
1683fe013be4SDimitry Andric   bool hasPredStores() const { return NumPredStores > 0; }
1684fe013be4SDimitry Andric
1685fe013be4SDimitry Andric /// Returns true if epilogue vectorization is considered profitable, and
1686fe013be4SDimitry Andric /// false otherwise.
1687fe013be4SDimitry Andric /// \p VF is the vectorization factor chosen for the original loop.
1688fe013be4SDimitry Andric bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1689d56accc7SDimitry Andric
1690753f127fSDimitry Andric private:
1691753f127fSDimitry Andric unsigned NumPredStores = 0;
1692753f127fSDimitry Andric
1693fe6060f1SDimitry Andric /// \return An upper bound for the vectorization factors for both
1694fe6060f1SDimitry Andric /// fixed and scalable vectorization, where the minimum-known number of
1695fe6060f1SDimitry Andric /// elements is a power-of-2 larger than zero. If scalable vectorization is
1696fe6060f1SDimitry Andric /// disabled or unsupported, then the scalable part will be equal to
1697fe6060f1SDimitry Andric /// ElementCount::getScalable(0).
1698c9157d92SDimitry Andric FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
16990eae32dcSDimitry Andric ElementCount UserVF,
17000eae32dcSDimitry Andric bool FoldTailByMasking);
17010b57cec5SDimitry Andric
1702fe6060f1SDimitry Andric   /// \return the maximized element count based on the target's vector
1703fe6060f1SDimitry Andric /// registers and the loop trip-count, but limited to a maximum safe VF.
1704fe6060f1SDimitry Andric /// This is a helper function of computeFeasibleMaxVF.
1705c9157d92SDimitry Andric ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1706fe6060f1SDimitry Andric unsigned SmallestType,
1707fe6060f1SDimitry Andric unsigned WidestType,
170881ad6265SDimitry Andric ElementCount MaxSafeVF,
17090eae32dcSDimitry Andric bool FoldTailByMasking);
1710fe6060f1SDimitry Andric
1711fe6060f1SDimitry Andric /// \return the maximum legal scalable VF, based on the safe max number
1712fe6060f1SDimitry Andric /// of elements.
1713fe6060f1SDimitry Andric ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1714fe6060f1SDimitry Andric
17150b57cec5SDimitry Andric /// Returns the execution time cost of an instruction for a given vector
17160b57cec5SDimitry Andric /// width. Vector width of one means scalar.
1717e8d8bef9SDimitry Andric VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
17180b57cec5SDimitry Andric
17190b57cec5SDimitry Andric /// The cost-computation logic from getInstructionCost which provides
17200b57cec5SDimitry Andric /// the vector type as an output parameter.
1721e8d8bef9SDimitry Andric InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1722e8d8bef9SDimitry Andric Type *&VectorTy);
1723e8d8bef9SDimitry Andric
1724e8d8bef9SDimitry Andric /// Return the cost of instructions in an inloop reduction pattern, if I is
1725e8d8bef9SDimitry Andric /// part of that pattern.
1726bdd1243dSDimitry Andric std::optional<InstructionCost>
1727fe6060f1SDimitry Andric getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1728c9157d92SDimitry Andric TTI::TargetCostKind CostKind) const;
17290b57cec5SDimitry Andric
17300b57cec5SDimitry Andric /// Calculate vectorization cost of memory instruction \p I.
1731e8d8bef9SDimitry Andric InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
17320b57cec5SDimitry Andric
17330b57cec5SDimitry Andric /// The cost computation for scalarized memory instruction.
1734e8d8bef9SDimitry Andric InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
17350b57cec5SDimitry Andric
17360b57cec5SDimitry Andric /// The cost computation for interleaving group of memory instructions.
1737e8d8bef9SDimitry Andric InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
17380b57cec5SDimitry Andric
17390b57cec5SDimitry Andric /// The cost computation for Gather/Scatter instruction.
1740e8d8bef9SDimitry Andric InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
17410b57cec5SDimitry Andric
17420b57cec5SDimitry Andric /// The cost computation for widening instruction \p I with consecutive
17430b57cec5SDimitry Andric /// memory access.
1744e8d8bef9SDimitry Andric InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
17450b57cec5SDimitry Andric
17460b57cec5SDimitry Andric /// The cost calculation for Load/Store instruction \p I with uniform pointer -
17470b57cec5SDimitry Andric /// Load: scalar load + broadcast.
17480b57cec5SDimitry Andric /// Store: scalar store + (loop invariant value stored? 0 : extract of last
17490b57cec5SDimitry Andric /// element)
1750e8d8bef9SDimitry Andric InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
17510b57cec5SDimitry Andric
17520b57cec5SDimitry Andric /// Estimate the overhead of scalarizing an instruction. This is a
17530b57cec5SDimitry Andric /// convenience wrapper for the type-based getScalarizationOverhead API.
1754bdd1243dSDimitry Andric InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1755bdd1243dSDimitry Andric TTI::TargetCostKind CostKind) const;
17560b57cec5SDimitry Andric
17570b57cec5SDimitry Andric /// Returns true if an artificially high cost for emulated masked memrefs
17580b57cec5SDimitry Andric /// should be used.
175904eeddc0SDimitry Andric bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
17600b57cec5SDimitry Andric
17610b57cec5SDimitry Andric /// Map of scalar integer values to the smallest bitwidth they can be legally
17620b57cec5SDimitry Andric /// represented as. The vector equivalents of these values should be truncated
17630b57cec5SDimitry Andric /// to this type.
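/// For example (illustrative): an i32 value whose result is only ever used as
/// an i8 can be recorded here with bitwidth 8, so its vector form becomes
/// <VF x i8> rather than <VF x i32>.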
17640b57cec5SDimitry Andric MapVector<Instruction *, uint64_t> MinBWs;
17650b57cec5SDimitry Andric
17660b57cec5SDimitry Andric /// A type representing the costs for instructions if they were to be
17670b57cec5SDimitry Andric /// scalarized rather than vectorized. The entries are Instruction-Cost
17680b57cec5SDimitry Andric /// pairs.
1769e8d8bef9SDimitry Andric using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
17700b57cec5SDimitry Andric
17710b57cec5SDimitry Andric   /// A set containing all BasicBlocks that are known to be present after
17720b57cec5SDimitry Andric   /// vectorization as predicated blocks.
1773753f127fSDimitry Andric DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1774753f127fSDimitry Andric PredicatedBBsAfterVectorization;
17750b57cec5SDimitry Andric
17760b57cec5SDimitry Andric /// Records whether it is allowed to have the original scalar loop execute at
17770b57cec5SDimitry Andric /// least once. This may be needed as a fallback loop in case runtime
17780b57cec5SDimitry Andric /// aliasing/dependence checks fail, or to handle the tail/remainder
17790b57cec5SDimitry Andric /// iterations when the trip count is unknown or doesn't divide by the VF,
17800b57cec5SDimitry Andric /// or as a peel-loop to handle gaps in interleave-groups.
17810b57cec5SDimitry Andric /// Under optsize and when the trip count is very small we don't allow any
17820b57cec5SDimitry Andric /// iterations to execute in the scalar loop.
17838bcb0991SDimitry Andric ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
17840b57cec5SDimitry Andric
17850b57cec5SDimitry Andric /// All blocks of loop are to be masked to fold tail of scalar iterations.
1786fe013be4SDimitry Andric bool CanFoldTailByMasking = false;
17870b57cec5SDimitry Andric
17880b57cec5SDimitry Andric /// A map holding scalar costs for different vectorization factors. The
17890b57cec5SDimitry Andric /// presence of a cost for an instruction in the mapping indicates that the
17900b57cec5SDimitry Andric /// instruction will be scalarized when vectorizing with the associated
17910b57cec5SDimitry Andric /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1792e8d8bef9SDimitry Andric DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
17930b57cec5SDimitry Andric
17940b57cec5SDimitry Andric /// Holds the instructions known to be uniform after vectorization.
17950b57cec5SDimitry Andric /// The data is collected per VF.
1796e8d8bef9SDimitry Andric DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
17970b57cec5SDimitry Andric
17980b57cec5SDimitry Andric /// Holds the instructions known to be scalar after vectorization.
17990b57cec5SDimitry Andric /// The data is collected per VF.
1800e8d8bef9SDimitry Andric DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
18010b57cec5SDimitry Andric
18020b57cec5SDimitry Andric /// Holds the instructions (address computations) that are forced to be
18030b57cec5SDimitry Andric /// scalarized.
1804e8d8bef9SDimitry Andric DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1805e8d8bef9SDimitry Andric
1806c9157d92SDimitry Andric /// PHINodes of the reductions that should be expanded in-loop.
1807c9157d92SDimitry Andric SmallPtrSet<PHINode *, 4> InLoopReductions;
1808e8d8bef9SDimitry Andric
1809e8d8bef9SDimitry Andric /// A Map of inloop reduction operations and their immediate chain operand.
1810e8d8bef9SDimitry Andric /// FIXME: This can be removed once reductions can be costed correctly in
1811c9157d92SDimitry Andric /// VPlan. This was added to allow quick lookup of the inloop operations.
1812e8d8bef9SDimitry Andric DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
18130b57cec5SDimitry Andric
18140b57cec5SDimitry Andric /// Returns the expected difference in cost from scalarizing the expression
18150b57cec5SDimitry Andric /// feeding a predicated instruction \p PredInst. The instructions to
18160b57cec5SDimitry Andric /// scalarize and their scalar costs are collected in \p ScalarCosts. A
18170b57cec5SDimitry Andric /// non-negative return value implies the expression will be scalarized.
18180b57cec5SDimitry Andric /// Currently, only single-use chains are considered for scalarization.
1819bdd1243dSDimitry Andric InstructionCost computePredInstDiscount(Instruction *PredInst,
1820bdd1243dSDimitry Andric ScalarCostsTy &ScalarCosts,
1821e8d8bef9SDimitry Andric ElementCount VF);
18220b57cec5SDimitry Andric
18230b57cec5SDimitry Andric /// Collect the instructions that are uniform after vectorization. An
18240b57cec5SDimitry Andric /// instruction is uniform if we represent it with a single scalar value in
18250b57cec5SDimitry Andric /// the vectorized loop corresponding to each vector iteration. Examples of
18260b57cec5SDimitry Andric /// uniform instructions include pointer operands of consecutive or
18270b57cec5SDimitry Andric /// interleaved memory accesses. Note that although uniformity implies an
18280b57cec5SDimitry Andric /// instruction will be scalar, the reverse is not true. In general, a
18290b57cec5SDimitry Andric /// scalarized instruction will be represented by VF scalar values in the
18300b57cec5SDimitry Andric /// vectorized loop, each corresponding to an iteration of the original
18310b57cec5SDimitry Andric /// scalar loop.
1832e8d8bef9SDimitry Andric void collectLoopUniforms(ElementCount VF);
18330b57cec5SDimitry Andric
18340b57cec5SDimitry Andric /// Collect the instructions that are scalar after vectorization. An
18350b57cec5SDimitry Andric /// instruction is scalar if it is known to be uniform or will be scalarized
18364824e7fdSDimitry Andric /// during vectorization. collectLoopScalars should only add non-uniform nodes
18374824e7fdSDimitry Andric /// to the list if they are used by a load/store instruction that is marked as
18384824e7fdSDimitry Andric /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
18394824e7fdSDimitry Andric /// VF values in the vectorized loop, each corresponding to an iteration of
18404824e7fdSDimitry Andric /// the original scalar loop.
1841e8d8bef9SDimitry Andric void collectLoopScalars(ElementCount VF);
18420b57cec5SDimitry Andric
18430b57cec5SDimitry Andric /// Keeps cost model vectorization decision and cost for instructions.
18440b57cec5SDimitry Andric /// Right now it is used for memory instructions only.
1845e8d8bef9SDimitry Andric using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1846e8d8bef9SDimitry Andric std::pair<InstWidening, InstructionCost>>;
18470b57cec5SDimitry Andric
18480b57cec5SDimitry Andric DecisionList WideningDecisions;
18490b57cec5SDimitry Andric
1850c9157d92SDimitry Andric using CallDecisionList =
1851c9157d92SDimitry Andric DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1852c9157d92SDimitry Andric
1853c9157d92SDimitry Andric CallDecisionList CallWideningDecisions;
1854c9157d92SDimitry Andric
18550b57cec5SDimitry Andric /// Returns true if \p V is expected to be vectorized and it needs to be
18560b57cec5SDimitry Andric /// extracted.
1857e8d8bef9SDimitry Andric   bool needsExtract(Value *V, ElementCount VF) const {
18580b57cec5SDimitry Andric Instruction *I = dyn_cast<Instruction>(V);
1859e8d8bef9SDimitry Andric if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1860e8d8bef9SDimitry Andric TheLoop->isLoopInvariant(I))
18610b57cec5SDimitry Andric return false;
18620b57cec5SDimitry Andric
18630b57cec5SDimitry Andric // Assume we can vectorize V (and hence we need extraction) if the
18640b57cec5SDimitry Andric // scalars are not computed yet. This can happen, because it is called
18650b57cec5SDimitry Andric // via getScalarizationOverhead from setCostBasedWideningDecision, before
18660b57cec5SDimitry Andric // the scalars are collected. That should be a safe assumption in most
18670b57cec5SDimitry Andric // cases, because we check if the operands have vectorizable types
18680b57cec5SDimitry Andric // beforehand in LoopVectorizationLegality.
1869fe013be4SDimitry Andric return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
18700b57cec5SDimitry Andric };
18710b57cec5SDimitry Andric
18720b57cec5SDimitry Andric /// Returns a range containing only operands needing to be extracted.
18730b57cec5SDimitry Andric   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1874fe6060f1SDimitry Andric ElementCount VF) const {
18750b57cec5SDimitry Andric return SmallVector<Value *, 4>(make_filter_range(
18760b57cec5SDimitry Andric Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
18770b57cec5SDimitry Andric }
18780b57cec5SDimitry Andric
18790b57cec5SDimitry Andric public:
18800b57cec5SDimitry Andric /// The loop that we evaluate.
18810b57cec5SDimitry Andric Loop *TheLoop;
18820b57cec5SDimitry Andric
18830b57cec5SDimitry Andric /// Predicated scalar evolution analysis.
18840b57cec5SDimitry Andric PredicatedScalarEvolution &PSE;
18850b57cec5SDimitry Andric
18860b57cec5SDimitry Andric /// Loop Info analysis.
18870b57cec5SDimitry Andric LoopInfo *LI;
18880b57cec5SDimitry Andric
18890b57cec5SDimitry Andric /// Vectorization legality.
18900b57cec5SDimitry Andric LoopVectorizationLegality *Legal;
18910b57cec5SDimitry Andric
18920b57cec5SDimitry Andric /// Vector target information.
18930b57cec5SDimitry Andric const TargetTransformInfo &TTI;
18940b57cec5SDimitry Andric
18950b57cec5SDimitry Andric /// Target Library Info.
18960b57cec5SDimitry Andric const TargetLibraryInfo *TLI;
18970b57cec5SDimitry Andric
18980b57cec5SDimitry Andric /// Demanded bits analysis.
18990b57cec5SDimitry Andric DemandedBits *DB;
19000b57cec5SDimitry Andric
19010b57cec5SDimitry Andric /// Assumption cache.
19020b57cec5SDimitry Andric AssumptionCache *AC;
19030b57cec5SDimitry Andric
19040b57cec5SDimitry Andric /// Interface to emit optimization remarks.
19050b57cec5SDimitry Andric OptimizationRemarkEmitter *ORE;
19060b57cec5SDimitry Andric
19070b57cec5SDimitry Andric const Function *TheFunction;
19080b57cec5SDimitry Andric
19090b57cec5SDimitry Andric /// Loop Vectorize Hint.
19100b57cec5SDimitry Andric const LoopVectorizeHints *Hints;
19110b57cec5SDimitry Andric
19120b57cec5SDimitry Andric /// The interleave access information contains groups of interleaved accesses
19130b57cec5SDimitry Andric /// with the same stride and close to each other.
19140b57cec5SDimitry Andric InterleavedAccessInfo &InterleaveInfo;
19150b57cec5SDimitry Andric
19160b57cec5SDimitry Andric /// Values to ignore in the cost model.
19170b57cec5SDimitry Andric SmallPtrSet<const Value *, 16> ValuesToIgnore;
19180b57cec5SDimitry Andric
19190b57cec5SDimitry Andric /// Values to ignore in the cost model when VF > 1.
19200b57cec5SDimitry Andric SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1921e8d8bef9SDimitry Andric
1922fe6060f1SDimitry Andric /// All element types found in the loop.
1923fe6060f1SDimitry Andric SmallPtrSet<Type *, 16> ElementTypesInLoop;
19240b57cec5SDimitry Andric };
19250b57cec5SDimitry Andric } // end namespace llvm
19260b57cec5SDimitry Andric
1927bdd1243dSDimitry Andric namespace {
1928fe6060f1SDimitry Andric /// Helper struct to manage generating runtime checks for vectorization.
1929fe6060f1SDimitry Andric ///
1930fe6060f1SDimitry Andric /// The runtime checks are created up-front in temporary blocks, un-linked from
1931fe6060f1SDimitry Andric /// the existing IR, so that their cost can be estimated accurately. After deciding to
1932fe6060f1SDimitry Andric /// vectorize, the checks are moved back. If deciding not to vectorize, the
1933fe6060f1SDimitry Andric /// temporary blocks are completely removed.
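///
/// Illustrative usage sketch (a sketch of the flow below, not a fixed API
/// contract):
///   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL, AddBranchWeights);
///   Checks.Create(L, LAI, UnionPred, VF, IC); // build temporary check blocks
///   InstructionCost RTCost = Checks.getCost(); // used when planning
///   // If vectorizing, emitSCEVChecks()/emitMemRuntimeChecks() re-link the
///   // blocks into the CFG; otherwise the destructor removes them.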
1934fe6060f1SDimitry Andric class GeneratedRTChecks {
1935fe6060f1SDimitry Andric /// Basic block which contains the generated SCEV checks, if any.
1936fe6060f1SDimitry Andric BasicBlock *SCEVCheckBlock = nullptr;
1937fe6060f1SDimitry Andric
1938fe6060f1SDimitry Andric /// The value representing the result of the generated SCEV checks. If it is
1939fe6060f1SDimitry Andric /// nullptr, either no SCEV checks have been generated or they have been used.
1940fe6060f1SDimitry Andric Value *SCEVCheckCond = nullptr;
1941fe6060f1SDimitry Andric
1942fe6060f1SDimitry Andric /// Basic block which contains the generated memory runtime checks, if any.
1943fe6060f1SDimitry Andric BasicBlock *MemCheckBlock = nullptr;
1944fe6060f1SDimitry Andric
1945fe6060f1SDimitry Andric /// The value representing the result of the generated memory runtime checks.
1946fe6060f1SDimitry Andric /// If it is nullptr, either no memory runtime checks have been generated or
1947fe6060f1SDimitry Andric /// they have been used.
1948349cc55cSDimitry Andric Value *MemRuntimeCheckCond = nullptr;
1949fe6060f1SDimitry Andric
1950fe6060f1SDimitry Andric DominatorTree *DT;
1951fe6060f1SDimitry Andric LoopInfo *LI;
1952753f127fSDimitry Andric TargetTransformInfo *TTI;
1953fe6060f1SDimitry Andric
1954fe6060f1SDimitry Andric SCEVExpander SCEVExp;
1955fe6060f1SDimitry Andric SCEVExpander MemCheckExp;
1956fe6060f1SDimitry Andric
1957753f127fSDimitry Andric bool CostTooHigh = false;
1958c9157d92SDimitry Andric const bool AddBranchWeights;
1959753f127fSDimitry Andric
1960*b9d9368bSDimitry Andric Loop *OuterLoop = nullptr;
1961*b9d9368bSDimitry Andric
1962fe6060f1SDimitry Andric public:
1963fe6060f1SDimitry Andric   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1964c9157d92SDimitry Andric TargetTransformInfo *TTI, const DataLayout &DL,
1965c9157d92SDimitry Andric bool AddBranchWeights)
1966753f127fSDimitry Andric : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1967c9157d92SDimitry Andric MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1968fe6060f1SDimitry Andric
1969fe6060f1SDimitry Andric /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1970fe6060f1SDimitry Andric /// accurately estimate the cost of the runtime checks. The blocks are
1971fe6060f1SDimitry Andric   /// un-linked from the IR and are added back during vector code generation. If
1972fe6060f1SDimitry Andric /// there is no vector code generation, the check blocks are removed
1973fe6060f1SDimitry Andric /// completely.
1974fe6060f1SDimitry Andric   void Create(Loop *L, const LoopAccessInfo &LAI,
197581ad6265SDimitry Andric const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1976fe6060f1SDimitry Andric
1977753f127fSDimitry Andric // Hard cutoff to limit compile-time increase in case a very large number of
1978753f127fSDimitry Andric // runtime checks needs to be generated.
1979753f127fSDimitry Andric // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1980753f127fSDimitry Andric // profile info.
1981753f127fSDimitry Andric CostTooHigh =
1982753f127fSDimitry Andric LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1983753f127fSDimitry Andric if (CostTooHigh)
1984753f127fSDimitry Andric return;
1985753f127fSDimitry Andric
1986fe6060f1SDimitry Andric BasicBlock *LoopHeader = L->getHeader();
1987fe6060f1SDimitry Andric BasicBlock *Preheader = L->getLoopPreheader();
1988fe6060f1SDimitry Andric
1989fe6060f1SDimitry Andric // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1990fe6060f1SDimitry Andric // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1991fe6060f1SDimitry Andric // may be used by SCEVExpander. The blocks will be un-linked from their
1992fe6060f1SDimitry Andric // predecessors and removed from LI & DT at the end of the function.
1993fe6060f1SDimitry Andric if (!UnionPred.isAlwaysTrue()) {
1994fe6060f1SDimitry Andric SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1995fe6060f1SDimitry Andric nullptr, "vector.scevcheck");
1996fe6060f1SDimitry Andric
1997fe6060f1SDimitry Andric SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1998fe6060f1SDimitry Andric &UnionPred, SCEVCheckBlock->getTerminator());
1999fe6060f1SDimitry Andric }
2000fe6060f1SDimitry Andric
2001fe6060f1SDimitry Andric const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2002fe6060f1SDimitry Andric if (RtPtrChecking.Need) {
2003fe6060f1SDimitry Andric auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2004fe6060f1SDimitry Andric MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2005fe6060f1SDimitry Andric "vector.memcheck");
2006fe6060f1SDimitry Andric
200781ad6265SDimitry Andric auto DiffChecks = RtPtrChecking.getDiffChecks();
200881ad6265SDimitry Andric if (DiffChecks) {
2009fcaf7f86SDimitry Andric Value *RuntimeVF = nullptr;
201081ad6265SDimitry Andric MemRuntimeCheckCond = addDiffRuntimeChecks(
2011bdd1243dSDimitry Andric MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2012fcaf7f86SDimitry Andric [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2013fcaf7f86SDimitry Andric if (!RuntimeVF)
2014fcaf7f86SDimitry Andric RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2015fcaf7f86SDimitry Andric return RuntimeVF;
201681ad6265SDimitry Andric },
201781ad6265SDimitry Andric IC);
201881ad6265SDimitry Andric } else {
2019c9157d92SDimitry Andric MemRuntimeCheckCond = addRuntimeChecks(
2020c9157d92SDimitry Andric MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2021c9157d92SDimitry Andric MemCheckExp, VectorizerParams::HoistRuntimeChecks);
202281ad6265SDimitry Andric }
2023fe6060f1SDimitry Andric assert(MemRuntimeCheckCond &&
2024fe6060f1SDimitry Andric "no RT checks generated although RtPtrChecking "
2025fe6060f1SDimitry Andric "claimed checks are required");
2026fe6060f1SDimitry Andric }
2027fe6060f1SDimitry Andric
2028fe6060f1SDimitry Andric if (!MemCheckBlock && !SCEVCheckBlock)
2029fe6060f1SDimitry Andric return;
2030fe6060f1SDimitry Andric
2031fe6060f1SDimitry Andric // Unhook the temporary block with the checks, update various places
2032fe6060f1SDimitry Andric // accordingly.
2033fe6060f1SDimitry Andric if (SCEVCheckBlock)
2034fe6060f1SDimitry Andric SCEVCheckBlock->replaceAllUsesWith(Preheader);
2035fe6060f1SDimitry Andric if (MemCheckBlock)
2036fe6060f1SDimitry Andric MemCheckBlock->replaceAllUsesWith(Preheader);
2037fe6060f1SDimitry Andric
2038fe6060f1SDimitry Andric if (SCEVCheckBlock) {
2039fe6060f1SDimitry Andric SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2040fe6060f1SDimitry Andric new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2041fe6060f1SDimitry Andric Preheader->getTerminator()->eraseFromParent();
2042fe6060f1SDimitry Andric }
2043fe6060f1SDimitry Andric if (MemCheckBlock) {
2044fe6060f1SDimitry Andric MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2045fe6060f1SDimitry Andric new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2046fe6060f1SDimitry Andric Preheader->getTerminator()->eraseFromParent();
2047fe6060f1SDimitry Andric }
2048fe6060f1SDimitry Andric
2049fe6060f1SDimitry Andric DT->changeImmediateDominator(LoopHeader, Preheader);
2050fe6060f1SDimitry Andric if (MemCheckBlock) {
2051fe6060f1SDimitry Andric DT->eraseNode(MemCheckBlock);
2052fe6060f1SDimitry Andric LI->removeBlock(MemCheckBlock);
2053fe6060f1SDimitry Andric }
2054fe6060f1SDimitry Andric if (SCEVCheckBlock) {
2055fe6060f1SDimitry Andric DT->eraseNode(SCEVCheckBlock);
2056fe6060f1SDimitry Andric LI->removeBlock(SCEVCheckBlock);
2057fe6060f1SDimitry Andric }
2058*b9d9368bSDimitry Andric
2059*b9d9368bSDimitry Andric // Outer loop is used as part of the later cost calculations.
2060*b9d9368bSDimitry Andric OuterLoop = L->getParentLoop();
2061fe6060f1SDimitry Andric }
2062fe6060f1SDimitry Andric
2063753f127fSDimitry Andric   InstructionCost getCost() {
2064753f127fSDimitry Andric if (SCEVCheckBlock || MemCheckBlock)
2065753f127fSDimitry Andric LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2066753f127fSDimitry Andric
2067753f127fSDimitry Andric if (CostTooHigh) {
2068753f127fSDimitry Andric InstructionCost Cost;
2069753f127fSDimitry Andric Cost.setInvalid();
2070753f127fSDimitry Andric LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2071753f127fSDimitry Andric return Cost;
2072753f127fSDimitry Andric }
2073753f127fSDimitry Andric
2074753f127fSDimitry Andric InstructionCost RTCheckCost = 0;
2075753f127fSDimitry Andric if (SCEVCheckBlock)
2076753f127fSDimitry Andric for (Instruction &I : *SCEVCheckBlock) {
2077753f127fSDimitry Andric if (SCEVCheckBlock->getTerminator() == &I)
2078753f127fSDimitry Andric continue;
2079753f127fSDimitry Andric InstructionCost C =
2080753f127fSDimitry Andric TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2081753f127fSDimitry Andric LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2082753f127fSDimitry Andric RTCheckCost += C;
2083753f127fSDimitry Andric }
2084*b9d9368bSDimitry Andric if (MemCheckBlock) {
2085*b9d9368bSDimitry Andric InstructionCost MemCheckCost = 0;
2086753f127fSDimitry Andric for (Instruction &I : *MemCheckBlock) {
2087753f127fSDimitry Andric if (MemCheckBlock->getTerminator() == &I)
2088753f127fSDimitry Andric continue;
2089753f127fSDimitry Andric InstructionCost C =
2090753f127fSDimitry Andric TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2091753f127fSDimitry Andric LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2092*b9d9368bSDimitry Andric MemCheckCost += C;
2093*b9d9368bSDimitry Andric }
2094*b9d9368bSDimitry Andric
2095*b9d9368bSDimitry Andric // If the runtime memory checks are being created inside an outer loop
2096*b9d9368bSDimitry Andric // we should find out if these checks are outer loop invariant. If so,
2097*b9d9368bSDimitry Andric       // the checks will likely be hoisted out and so the effective cost will be
2098*b9d9368bSDimitry Andric       // reduced according to the outer loop trip count.
2099*b9d9368bSDimitry Andric if (OuterLoop) {
2100*b9d9368bSDimitry Andric ScalarEvolution *SE = MemCheckExp.getSE();
2101*b9d9368bSDimitry Andric // TODO: If profitable, we could refine this further by analysing every
2102*b9d9368bSDimitry Andric // individual memory check, since there could be a mixture of loop
2103*b9d9368bSDimitry Andric // variant and invariant checks that mean the final condition is
2104*b9d9368bSDimitry Andric // variant.
2105*b9d9368bSDimitry Andric const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2106*b9d9368bSDimitry Andric if (SE->isLoopInvariant(Cond, OuterLoop)) {
2107*b9d9368bSDimitry Andric // It seems reasonable to assume that we can reduce the effective
2108*b9d9368bSDimitry Andric // cost of the checks even when we know nothing about the trip
2109*b9d9368bSDimitry Andric // count. Assume that the outer loop executes at least twice.
2110*b9d9368bSDimitry Andric unsigned BestTripCount = 2;
2111*b9d9368bSDimitry Andric
2112*b9d9368bSDimitry Andric // If exact trip count is known use that.
2113*b9d9368bSDimitry Andric if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2114*b9d9368bSDimitry Andric BestTripCount = SmallTC;
2115*b9d9368bSDimitry Andric else if (LoopVectorizeWithBlockFrequency) {
2116*b9d9368bSDimitry Andric // Else use profile data if available.
2117*b9d9368bSDimitry Andric if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2118*b9d9368bSDimitry Andric BestTripCount = *EstimatedTC;
2119*b9d9368bSDimitry Andric }
2120*b9d9368bSDimitry Andric
2121*b9d9368bSDimitry Andric InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
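// Illustrative example: with MemCheckCost = 12 and BestTripCount = 4, the
// effective cost charged for the checks becomes 12 / 4 = 3.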
2122*b9d9368bSDimitry Andric
2123*b9d9368bSDimitry Andric // Let's ensure the cost is always at least 1.
2124*b9d9368bSDimitry Andric NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2125*b9d9368bSDimitry Andric (InstructionCost::CostType)1);
2126*b9d9368bSDimitry Andric
2127*b9d9368bSDimitry Andric LLVM_DEBUG(dbgs()
2128*b9d9368bSDimitry Andric << "We expect runtime memory checks to be hoisted "
2129*b9d9368bSDimitry Andric << "out of the outer loop. Cost reduced from "
2130*b9d9368bSDimitry Andric << MemCheckCost << " to " << NewMemCheckCost << '\n');
2131*b9d9368bSDimitry Andric
2132*b9d9368bSDimitry Andric MemCheckCost = NewMemCheckCost;
2133*b9d9368bSDimitry Andric }
2134*b9d9368bSDimitry Andric }
2135*b9d9368bSDimitry Andric
2136*b9d9368bSDimitry Andric RTCheckCost += MemCheckCost;
2137753f127fSDimitry Andric }
2138753f127fSDimitry Andric
2139753f127fSDimitry Andric if (SCEVCheckBlock || MemCheckBlock)
2140753f127fSDimitry Andric LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2141753f127fSDimitry Andric << "\n");
2142753f127fSDimitry Andric
2143753f127fSDimitry Andric return RTCheckCost;
2144753f127fSDimitry Andric }
2145753f127fSDimitry Andric
2146fe6060f1SDimitry Andric /// Remove the created SCEV & memory runtime check blocks & instructions, if
2147fe6060f1SDimitry Andric /// unused.
2148fe6060f1SDimitry Andric   ~GeneratedRTChecks() {
214904eeddc0SDimitry Andric SCEVExpanderCleaner SCEVCleaner(SCEVExp);
215004eeddc0SDimitry Andric SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2151fe6060f1SDimitry Andric if (!SCEVCheckCond)
2152fe6060f1SDimitry Andric SCEVCleaner.markResultUsed();
2153fe6060f1SDimitry Andric
2154fe6060f1SDimitry Andric if (!MemRuntimeCheckCond)
2155fe6060f1SDimitry Andric MemCheckCleaner.markResultUsed();
2156fe6060f1SDimitry Andric
2157fe6060f1SDimitry Andric if (MemRuntimeCheckCond) {
2158fe6060f1SDimitry Andric auto &SE = *MemCheckExp.getSE();
2159fe6060f1SDimitry Andric // Memory runtime check generation creates compares that use expanded
2160fe6060f1SDimitry Andric // values. Remove them before running the SCEVExpanderCleaners.
2161fe6060f1SDimitry Andric for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2162fe6060f1SDimitry Andric if (MemCheckExp.isInsertedInstruction(&I))
2163fe6060f1SDimitry Andric continue;
2164fe6060f1SDimitry Andric SE.forgetValue(&I);
2165fe6060f1SDimitry Andric I.eraseFromParent();
2166fe6060f1SDimitry Andric }
2167fe6060f1SDimitry Andric }
2168fe6060f1SDimitry Andric MemCheckCleaner.cleanup();
2169fe6060f1SDimitry Andric SCEVCleaner.cleanup();
2170fe6060f1SDimitry Andric
2171fe6060f1SDimitry Andric if (SCEVCheckCond)
2172fe6060f1SDimitry Andric SCEVCheckBlock->eraseFromParent();
2173fe6060f1SDimitry Andric if (MemRuntimeCheckCond)
2174fe6060f1SDimitry Andric MemCheckBlock->eraseFromParent();
2175fe6060f1SDimitry Andric }
2176fe6060f1SDimitry Andric
2177fe6060f1SDimitry Andric /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2178fe6060f1SDimitry Andric /// adjusts the branches to branch to the vector preheader or \p Bypass,
2179fe6060f1SDimitry Andric /// depending on the generated condition.
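/// The resulting control flow is roughly (illustrative):
///   Pred -> SCEVCheckBlock -> { Bypass, LoopVectorPreHeader }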
218081ad6265SDimitry Andric   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2181fe6060f1SDimitry Andric BasicBlock *LoopVectorPreHeader,
2182fe6060f1SDimitry Andric BasicBlock *LoopExitBlock) {
2183fe6060f1SDimitry Andric if (!SCEVCheckCond)
2184fe6060f1SDimitry Andric return nullptr;
218581ad6265SDimitry Andric
218681ad6265SDimitry Andric Value *Cond = SCEVCheckCond;
218781ad6265SDimitry Andric // Mark the check as used, to prevent it from being removed during cleanup.
218881ad6265SDimitry Andric SCEVCheckCond = nullptr;
218981ad6265SDimitry Andric if (auto *C = dyn_cast<ConstantInt>(Cond))
2190fe6060f1SDimitry Andric if (C->isZero())
2191fe6060f1SDimitry Andric return nullptr;
2192fe6060f1SDimitry Andric
2193fe6060f1SDimitry Andric auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2194fe6060f1SDimitry Andric
2195fe6060f1SDimitry Andric BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2196fe6060f1SDimitry Andric // Create new preheader for vector loop.
2197*b9d9368bSDimitry Andric if (OuterLoop)
2198*b9d9368bSDimitry Andric OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2199fe6060f1SDimitry Andric
2200fe6060f1SDimitry Andric SCEVCheckBlock->getTerminator()->eraseFromParent();
2201fe6060f1SDimitry Andric SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2202fe6060f1SDimitry Andric Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2203fe6060f1SDimitry Andric SCEVCheckBlock);
2204fe6060f1SDimitry Andric
2205fe6060f1SDimitry Andric DT->addNewBlock(SCEVCheckBlock, Pred);
2206fe6060f1SDimitry Andric DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2207fe6060f1SDimitry Andric
2208c9157d92SDimitry Andric BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2209c9157d92SDimitry Andric if (AddBranchWeights)
2210c9157d92SDimitry Andric setBranchWeights(BI, SCEVCheckBypassWeights);
2211c9157d92SDimitry Andric ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2212fe6060f1SDimitry Andric return SCEVCheckBlock;
2213fe6060f1SDimitry Andric }
2214fe6060f1SDimitry Andric
2215fe6060f1SDimitry Andric /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2216fe6060f1SDimitry Andric /// the branches to branch to the vector preheader or \p Bypass, depending on
2217fe6060f1SDimitry Andric /// the generated condition.
221881ad6265SDimitry Andric   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2219fe6060f1SDimitry Andric BasicBlock *LoopVectorPreHeader) {
2220fe6060f1SDimitry Andric // Check if we generated code that checks in runtime if arrays overlap.
2221fe6060f1SDimitry Andric if (!MemRuntimeCheckCond)
2222fe6060f1SDimitry Andric return nullptr;
2223fe6060f1SDimitry Andric
2224fe6060f1SDimitry Andric auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2225fe6060f1SDimitry Andric Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2226fe6060f1SDimitry Andric MemCheckBlock);
2227fe6060f1SDimitry Andric
2228fe6060f1SDimitry Andric DT->addNewBlock(MemCheckBlock, Pred);
2229fe6060f1SDimitry Andric DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2230fe6060f1SDimitry Andric MemCheckBlock->moveBefore(LoopVectorPreHeader);
2231fe6060f1SDimitry Andric
2232*b9d9368bSDimitry Andric if (OuterLoop)
2233*b9d9368bSDimitry Andric OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2234fe6060f1SDimitry Andric
2235c9157d92SDimitry Andric BranchInst &BI =
2236c9157d92SDimitry Andric *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2237c9157d92SDimitry Andric if (AddBranchWeights) {
2238c9157d92SDimitry Andric setBranchWeights(BI, MemCheckBypassWeights);
2239c9157d92SDimitry Andric }
2240c9157d92SDimitry Andric ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2241fe6060f1SDimitry Andric MemCheckBlock->getTerminator()->setDebugLoc(
2242fe6060f1SDimitry Andric Pred->getTerminator()->getDebugLoc());
2243fe6060f1SDimitry Andric
2244fe6060f1SDimitry Andric // Mark the check as used, to prevent it from being removed during cleanup.
2245fe6060f1SDimitry Andric MemRuntimeCheckCond = nullptr;
2246fe6060f1SDimitry Andric return MemCheckBlock;
2247fe6060f1SDimitry Andric }
2248fe6060f1SDimitry Andric };
2249bdd1243dSDimitry Andric } // namespace
2250fe6060f1SDimitry Andric
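// Returns true if tail folding with the given style uses an active lane mask.
// As an illustrative sketch (not the exact IR emitted here), such styles
// compute a per-iteration predicate along the lines of
//   %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv, i64 %tc)
// and use it to mask the widened memory operations.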
2251fe013be4SDimitry Andric static bool useActiveLaneMask(TailFoldingStyle Style) {
2252fe013be4SDimitry Andric return Style == TailFoldingStyle::Data ||
2253fe013be4SDimitry Andric Style == TailFoldingStyle::DataAndControlFlow ||
2254fe013be4SDimitry Andric Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2255fe013be4SDimitry Andric }
2256fe013be4SDimitry Andric
2257fe013be4SDimitry Andric static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2258fe013be4SDimitry Andric return Style == TailFoldingStyle::DataAndControlFlow ||
2259fe013be4SDimitry Andric Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2260fe013be4SDimitry Andric }
2261fe013be4SDimitry Andric
22620b57cec5SDimitry Andric // Return true if \p OuterLp is an outer loop annotated with hints for explicit
22630b57cec5SDimitry Andric // vectorization. The loop needs to be annotated with #pragma omp simd
22640b57cec5SDimitry Andric // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
22650b57cec5SDimitry Andric // vector length information is not provided, vectorization is not considered
22660b57cec5SDimitry Andric // explicit. Interleave hints are not allowed either. These limitations will be
22670b57cec5SDimitry Andric // relaxed in the future.
22680b57cec5SDimitry Andric // Please note that we are currently forced to abuse the pragma 'clang
22690b57cec5SDimitry Andric // vectorize' semantics. This pragma provides *auto-vectorization hints*
22700b57cec5SDimitry Andric // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
22710b57cec5SDimitry Andric // provides *explicit vectorization hints* (LV can bypass legal checks and
22720b57cec5SDimitry Andric // assume that vectorization is legal). However, both hints are implemented
22730b57cec5SDimitry Andric // using the same metadata (llvm.loop.vectorize, processed by
22740b57cec5SDimitry Andric // LoopVectorizeHints). This will be fixed in the future when the native IR
22750b57cec5SDimitry Andric // representation for pragma 'omp simd' is introduced.
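// For example (illustrative), an outer loop annotated as follows carries the
// kind of explicit vectorization hint this check looks for:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < n; ++i)
//     for (j = 0; j < m; ++j)
//       ...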
22760b57cec5SDimitry Andric static bool isExplicitVecOuterLoop(Loop *OuterLp,
22770b57cec5SDimitry Andric OptimizationRemarkEmitter *ORE) {
2278e8d8bef9SDimitry Andric assert(!OuterLp->isInnermost() && "This is not an outer loop");
22790b57cec5SDimitry Andric LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
22800b57cec5SDimitry Andric
22810b57cec5SDimitry Andric // Only outer loops with an explicit vectorization hint are supported.
22820b57cec5SDimitry Andric // Unannotated outer loops are ignored.
22830b57cec5SDimitry Andric if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
22840b57cec5SDimitry Andric return false;
22850b57cec5SDimitry Andric
22860b57cec5SDimitry Andric Function *Fn = OuterLp->getHeader()->getParent();
22870b57cec5SDimitry Andric if (!Hints.allowVectorization(Fn, OuterLp,
22880b57cec5SDimitry Andric true /*VectorizeOnlyWhenForced*/)) {
22890b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
22900b57cec5SDimitry Andric return false;
22910b57cec5SDimitry Andric }
22920b57cec5SDimitry Andric
22930b57cec5SDimitry Andric if (Hints.getInterleave() > 1) {
22940b57cec5SDimitry Andric // TODO: Interleave support is future work.
22950b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
22960b57cec5SDimitry Andric "outer loops.\n");
22970b57cec5SDimitry Andric Hints.emitRemarkWithHints();
22980b57cec5SDimitry Andric return false;
22990b57cec5SDimitry Andric }
23000b57cec5SDimitry Andric
23010b57cec5SDimitry Andric return true;
23020b57cec5SDimitry Andric }
23030b57cec5SDimitry Andric
23040b57cec5SDimitry Andric static void collectSupportedLoops(Loop &L, LoopInfo *LI,
23050b57cec5SDimitry Andric OptimizationRemarkEmitter *ORE,
23060b57cec5SDimitry Andric SmallVectorImpl<Loop *> &V) {
23070b57cec5SDimitry Andric // Collect inner loops and outer loops without irreducible control flow. For
23080b57cec5SDimitry Andric // now, only collect outer loops that have explicit vectorization hints. If we
23090b57cec5SDimitry Andric // are stress testing the VPlan H-CFG construction, we collect the outermost
23100b57cec5SDimitry Andric // loop of every loop nest.
2311e8d8bef9SDimitry Andric if (L.isInnermost() || VPlanBuildStressTest ||
23120b57cec5SDimitry Andric (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
23130b57cec5SDimitry Andric LoopBlocksRPO RPOT(&L);
23140b57cec5SDimitry Andric RPOT.perform(LI);
23150b57cec5SDimitry Andric if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
23160b57cec5SDimitry Andric V.push_back(&L);
23170b57cec5SDimitry Andric // TODO: Collect inner loops inside marked outer loops in case
23180b57cec5SDimitry Andric // vectorization fails for the outer loop. Do not invoke
23190b57cec5SDimitry Andric // 'containsIrreducibleCFG' again for inner loops when the outer loop is
23200b57cec5SDimitry Andric // already known to be reducible. We can use an inherited attribute for
23210b57cec5SDimitry Andric // that.
23220b57cec5SDimitry Andric return;
23230b57cec5SDimitry Andric }
23240b57cec5SDimitry Andric }
23250b57cec5SDimitry Andric for (Loop *InnerL : L)
23260b57cec5SDimitry Andric collectSupportedLoops(*InnerL, LI, ORE, V);
23270b57cec5SDimitry Andric }
23280b57cec5SDimitry Andric
23290b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
23300b57cec5SDimitry Andric // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
23310b57cec5SDimitry Andric // LoopVectorizationCostModel and LoopVectorizationPlanner.
23320b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
23330b57cec5SDimitry Andric
233481ad6265SDimitry Andric /// Compute the transformed value of Index at offset StartValue using step
233581ad6265SDimitry Andric /// StepValue.
233681ad6265SDimitry Andric /// For integer induction, returns StartValue + Index * StepValue.
233781ad6265SDimitry Andric /// For pointer induction, returns StartValue[Index * StepValue].
233881ad6265SDimitry Andric /// FIXME: The newly created binary instructions should contain nsw/nuw
233981ad6265SDimitry Andric /// flags, which can be found from the original scalar operations.
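/// For example (illustrative), for an integer induction with StartValue 7 and
/// StepValue 3, an Index of 4 is transformed to 7 + 4 * 3 = 19.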
2340c9157d92SDimitry Andric static Value *
2341c9157d92SDimitry Andric emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2342c9157d92SDimitry Andric Value *Step,
2343c9157d92SDimitry Andric InductionDescriptor::InductionKind InductionKind,
2344c9157d92SDimitry Andric const BinaryOperator *InductionBinOp) {
2345bdd1243dSDimitry Andric Type *StepTy = Step->getType();
2346bdd1243dSDimitry Andric Value *CastedIndex = StepTy->isIntegerTy()
2347bdd1243dSDimitry Andric ? B.CreateSExtOrTrunc(Index, StepTy)
2348bdd1243dSDimitry Andric : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2349bdd1243dSDimitry Andric if (CastedIndex != Index) {
2350bdd1243dSDimitry Andric CastedIndex->setName(CastedIndex->getName() + ".cast");
2351bdd1243dSDimitry Andric Index = CastedIndex;
2352bdd1243dSDimitry Andric }
235381ad6265SDimitry Andric
235481ad6265SDimitry Andric // Note: the IR at this point is broken. We cannot use SE to create any new
235581ad6265SDimitry Andric // SCEV and then expand it, hoping that SCEV's simplification will give us
235681ad6265SDimitry Andric   // more optimal code. Unfortunately, attempting to do so on invalid IR may
235781ad6265SDimitry Andric // lead to various SCEV crashes. So all we can do is to use builder and rely
235881ad6265SDimitry Andric // on InstCombine for future simplifications. Here we handle some trivial
235981ad6265SDimitry Andric // cases only.
236081ad6265SDimitry Andric auto CreateAdd = [&B](Value *X, Value *Y) {
236181ad6265SDimitry Andric assert(X->getType() == Y->getType() && "Types don't match!");
236281ad6265SDimitry Andric if (auto *CX = dyn_cast<ConstantInt>(X))
236381ad6265SDimitry Andric if (CX->isZero())
236481ad6265SDimitry Andric return Y;
236581ad6265SDimitry Andric if (auto *CY = dyn_cast<ConstantInt>(Y))
236681ad6265SDimitry Andric if (CY->isZero())
236781ad6265SDimitry Andric return X;
236881ad6265SDimitry Andric return B.CreateAdd(X, Y);
236981ad6265SDimitry Andric };
237081ad6265SDimitry Andric
237181ad6265SDimitry Andric // We allow X to be a vector type, in which case Y will potentially be
237281ad6265SDimitry Andric // splatted into a vector with the same element count.
237381ad6265SDimitry Andric auto CreateMul = [&B](Value *X, Value *Y) {
237481ad6265SDimitry Andric assert(X->getType()->getScalarType() == Y->getType() &&
237581ad6265SDimitry Andric "Types don't match!");
237681ad6265SDimitry Andric if (auto *CX = dyn_cast<ConstantInt>(X))
237781ad6265SDimitry Andric if (CX->isOne())
237881ad6265SDimitry Andric return Y;
237981ad6265SDimitry Andric if (auto *CY = dyn_cast<ConstantInt>(Y))
238081ad6265SDimitry Andric if (CY->isOne())
238181ad6265SDimitry Andric return X;
238281ad6265SDimitry Andric VectorType *XVTy = dyn_cast<VectorType>(X->getType());
238381ad6265SDimitry Andric if (XVTy && !isa<VectorType>(Y->getType()))
238481ad6265SDimitry Andric Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
238581ad6265SDimitry Andric return B.CreateMul(X, Y);
238681ad6265SDimitry Andric };
238781ad6265SDimitry Andric
2388c9157d92SDimitry Andric switch (InductionKind) {
238981ad6265SDimitry Andric case InductionDescriptor::IK_IntInduction: {
239081ad6265SDimitry Andric assert(!isa<VectorType>(Index->getType()) &&
239181ad6265SDimitry Andric "Vector indices not supported for integer inductions yet");
239281ad6265SDimitry Andric assert(Index->getType() == StartValue->getType() &&
239381ad6265SDimitry Andric "Index type does not match StartValue type");
239481ad6265SDimitry Andric if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
239581ad6265SDimitry Andric return B.CreateSub(StartValue, Index);
239681ad6265SDimitry Andric auto *Offset = CreateMul(Index, Step);
239781ad6265SDimitry Andric return CreateAdd(StartValue, Offset);
239881ad6265SDimitry Andric }
2399a58f00eaSDimitry Andric case InductionDescriptor::IK_PtrInduction:
2400a58f00eaSDimitry Andric return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
240181ad6265SDimitry Andric case InductionDescriptor::IK_FpInduction: {
240281ad6265SDimitry Andric assert(!isa<VectorType>(Index->getType()) &&
240381ad6265SDimitry Andric "Vector indices not supported for FP inductions yet");
240481ad6265SDimitry Andric assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
240581ad6265SDimitry Andric assert(InductionBinOp &&
240681ad6265SDimitry Andric (InductionBinOp->getOpcode() == Instruction::FAdd ||
240781ad6265SDimitry Andric InductionBinOp->getOpcode() == Instruction::FSub) &&
240881ad6265SDimitry Andric "Original bin op should be defined for FP induction");
240981ad6265SDimitry Andric
241081ad6265SDimitry Andric Value *MulExp = B.CreateFMul(Step, Index);
241181ad6265SDimitry Andric return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
241281ad6265SDimitry Andric "induction");
241381ad6265SDimitry Andric }
241481ad6265SDimitry Andric case InductionDescriptor::IK_NoInduction:
241581ad6265SDimitry Andric return nullptr;
241681ad6265SDimitry Andric }
241781ad6265SDimitry Andric llvm_unreachable("invalid enum");
241881ad6265SDimitry Andric }
241981ad6265SDimitry Andric
2420fe013be4SDimitry Andric std::optional<unsigned> getMaxVScale(const Function &F,
2421fe013be4SDimitry Andric const TargetTransformInfo &TTI) {
2422fe013be4SDimitry Andric if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2423fe013be4SDimitry Andric return MaxVScale;
2424fe013be4SDimitry Andric
2425fe013be4SDimitry Andric if (F.hasFnAttribute(Attribute::VScaleRange))
2426fe013be4SDimitry Andric return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2427fe013be4SDimitry Andric
2428fe013be4SDimitry Andric return std::nullopt;
2429fe013be4SDimitry Andric }
2430fe013be4SDimitry Andric
2431fe013be4SDimitry Andric /// For the given VF and UF and maximum trip count computed for the loop, return
2432fe013be4SDimitry Andric /// whether the induction variable might overflow in the vectorized loop. If not,
2433fe013be4SDimitry Andric /// then we know a runtime overflow check always evaluates to false and can be
2434fe013be4SDimitry Andric /// removed.
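/// For example (illustrative), with an i8 induction type, a known maximum trip
/// count of 200 and VF * UF = 16: 255 - 200 = 55 > 16, so the step cannot wrap
/// and the check can be removed; with a maximum trip count of 250 it might
/// wrap, so the check must be kept.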
2435fe013be4SDimitry Andric static bool isIndvarOverflowCheckKnownFalse(
2436fe013be4SDimitry Andric const LoopVectorizationCostModel *Cost,
2437fe013be4SDimitry Andric ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2438fe013be4SDimitry Andric // Always be conservative if we don't know the exact unroll factor.
2439fe013be4SDimitry Andric unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2440fe013be4SDimitry Andric
2441fe013be4SDimitry Andric Type *IdxTy = Cost->Legal->getWidestInductionType();
2442fe013be4SDimitry Andric APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2443fe013be4SDimitry Andric
2444fe013be4SDimitry Andric   // The runtime overflow check is known to be false iff the (max) trip-count
2445fe013be4SDimitry Andric // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2446fe013be4SDimitry Andric // the vector loop induction variable.
2447fe013be4SDimitry Andric if (unsigned TC =
2448fe013be4SDimitry Andric Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2449fe013be4SDimitry Andric uint64_t MaxVF = VF.getKnownMinValue();
2450fe013be4SDimitry Andric if (VF.isScalable()) {
2451fe013be4SDimitry Andric std::optional<unsigned> MaxVScale =
2452fe013be4SDimitry Andric getMaxVScale(*Cost->TheFunction, Cost->TTI);
2453fe013be4SDimitry Andric if (!MaxVScale)
2454fe013be4SDimitry Andric return false;
2455fe013be4SDimitry Andric MaxVF *= *MaxVScale;
2456fe013be4SDimitry Andric }
2457fe013be4SDimitry Andric
2458fe013be4SDimitry Andric return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2459fe013be4SDimitry Andric }
2460fe013be4SDimitry Andric
2461fe013be4SDimitry Andric return false;
2462fe013be4SDimitry Andric }
2463fe013be4SDimitry Andric
24640b57cec5SDimitry Andric // Return whether we allow using masked interleave-groups (for dealing with
24650b57cec5SDimitry Andric // strided loads/stores that reside in predicated blocks, or for dealing
24660b57cec5SDimitry Andric // with gaps).
24670b57cec5SDimitry Andric static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
24680b57cec5SDimitry Andric // If an override option has been passed in for interleaved accesses, use it.
24690b57cec5SDimitry Andric if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
24700b57cec5SDimitry Andric return EnableMaskedInterleavedMemAccesses;
24710b57cec5SDimitry Andric
24720b57cec5SDimitry Andric return TTI.enableMaskedInterleavedAccessVectorization();
24730b57cec5SDimitry Andric }
24740b57cec5SDimitry Andric
24750b57cec5SDimitry Andric // Try to vectorize the interleave group that \p Instr belongs to.
24760b57cec5SDimitry Andric //
24770b57cec5SDimitry Andric // E.g. Translate following interleaved load group (factor = 3):
24780b57cec5SDimitry Andric // for (i = 0; i < N; i+=3) {
24790b57cec5SDimitry Andric // R = Pic[i]; // Member of index 0
24800b57cec5SDimitry Andric // G = Pic[i+1]; // Member of index 1
24810b57cec5SDimitry Andric // B = Pic[i+2]; // Member of index 2
24820b57cec5SDimitry Andric // ... // do something to R, G, B
24830b57cec5SDimitry Andric // }
24840b57cec5SDimitry Andric // To:
24850b57cec5SDimitry Andric // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2486e8d8bef9SDimitry Andric // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2487e8d8bef9SDimitry Andric // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2488e8d8bef9SDimitry Andric // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
24890b57cec5SDimitry Andric //
24900b57cec5SDimitry Andric // Or translate following interleaved store group (factor = 3):
24910b57cec5SDimitry Andric // for (i = 0; i < N; i+=3) {
24920b57cec5SDimitry Andric // ... do something to R, G, B
24930b57cec5SDimitry Andric // Pic[i] = R; // Member of index 0
24940b57cec5SDimitry Andric // Pic[i+1] = G; // Member of index 1
24950b57cec5SDimitry Andric // Pic[i+2] = B; // Member of index 2
24960b57cec5SDimitry Andric // }
24970b57cec5SDimitry Andric // To:
24980b57cec5SDimitry Andric // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2499e8d8bef9SDimitry Andric // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
25000b57cec5SDimitry Andric // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
25010b57cec5SDimitry Andric // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
25020b57cec5SDimitry Andric // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
25035ffd83dbSDimitry Andric void InnerLoopVectorizer::vectorizeInterleaveGroup(
2504e8d8bef9SDimitry Andric const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2505e8d8bef9SDimitry Andric VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2506fe013be4SDimitry Andric VPValue *BlockInMask, bool NeedsMaskForGaps) {
25075ffd83dbSDimitry Andric Instruction *Instr = Group->getInsertPos();
25080b57cec5SDimitry Andric const DataLayout &DL = Instr->getModule()->getDataLayout();
25090b57cec5SDimitry Andric
25100b57cec5SDimitry Andric // Prepare for the vector type of the interleaved load/store.
2511fe6060f1SDimitry Andric Type *ScalarTy = getLoadStoreType(Instr);
25120b57cec5SDimitry Andric unsigned InterleaveFactor = Group->getFactor();
2513e8d8bef9SDimitry Andric auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
25140b57cec5SDimitry Andric
25150b57cec5SDimitry Andric // Prepare for the new pointers.
2516480093f4SDimitry Andric SmallVector<Value *, 2> AddrParts;
25170b57cec5SDimitry Andric unsigned Index = Group->getIndex(Instr);
25180b57cec5SDimitry Andric
25190b57cec5SDimitry Andric // TODO: extend the masked interleaved-group support to reversed access.
2520480093f4SDimitry Andric assert((!BlockInMask || !Group->isReverse()) &&
2521480093f4SDimitry Andric "Reversed masked interleave-group not supported.");
25220b57cec5SDimitry Andric
2523fe013be4SDimitry Andric Value *Idx;
25240b57cec5SDimitry Andric // If the group is reverse, adjust the index to refer to the last vector lane
25250b57cec5SDimitry Andric // instead of the first. We adjust the index from the first vector lane,
25260b57cec5SDimitry Andric // rather than directly getting the pointer for lane VF - 1, because the
25270b57cec5SDimitry Andric // pointer operand of the interleaved access is supposed to be uniform. For
25280b57cec5SDimitry Andric // uniform instructions, we're only required to generate a value for the
25290b57cec5SDimitry Andric // first vector lane in each unroll iteration.
2530fe013be4SDimitry Andric if (Group->isReverse()) {
2531fe013be4SDimitry Andric Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2532fe013be4SDimitry Andric Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2533fe013be4SDimitry Andric Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2534fe013be4SDimitry Andric Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2535fe013be4SDimitry Andric Idx = Builder.CreateNeg(Idx);
2536fe013be4SDimitry Andric } else
2537fe013be4SDimitry Andric Idx = Builder.getInt32(-Index);
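// For example (illustrative values): with VF = 4, an interleave factor of 3
// and member index 1, a reversed group computes Idx = -((4 - 1) * 3 + 1) = -10,
// while a forward group simply uses Idx = -1.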
25380b57cec5SDimitry Andric
25390b57cec5SDimitry Andric for (unsigned Part = 0; Part < UF; Part++) {
2540fe6060f1SDimitry Andric Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2541c9157d92SDimitry Andric if (auto *I = dyn_cast<Instruction>(AddrPart))
2542c9157d92SDimitry Andric State.setDebugLocFrom(I->getDebugLoc());
25430b57cec5SDimitry Andric
25440b57cec5SDimitry Andric // Note that the current instruction could be at any member index. We need to
25450b57cec5SDimitry Andric // adjust the address to point at the member of index 0.
25460b57cec5SDimitry Andric //
25470b57cec5SDimitry Andric // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
25480b57cec5SDimitry Andric // b = A[i]; // Member of index 0
25490b57cec5SDimitry Andric // The current pointer points to A[i+1]; adjust it to A[i].
25500b57cec5SDimitry Andric //
25510b57cec5SDimitry Andric // E.g. A[i+1] = a; // Member of index 1
25520b57cec5SDimitry Andric // A[i] = b; // Member of index 0
25530b57cec5SDimitry Andric // A[i+2] = c; // Member of index 2 (Current instruction)
25540b57cec5SDimitry Andric // The current pointer points to A[i+2]; adjust it to A[i].
2555480093f4SDimitry Andric
2556480093f4SDimitry Andric bool InBounds = false;
2557480093f4SDimitry Andric if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2558480093f4SDimitry Andric InBounds = gep->isInBounds();
2559fe013be4SDimitry Andric AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2560c9157d92SDimitry Andric AddrParts.push_back(AddrPart);
25610b57cec5SDimitry Andric }
25620b57cec5SDimitry Andric
2563c9157d92SDimitry Andric State.setDebugLocFrom(Instr->getDebugLoc());
2564e8d8bef9SDimitry Andric Value *PoisonVec = PoisonValue::get(VecTy);
25650b57cec5SDimitry Andric
2566fe013be4SDimitry Andric auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2567fe013be4SDimitry Andric unsigned Part, Value *MaskForGaps) -> Value * {
2568fe013be4SDimitry Andric if (VF.isScalable()) {
2569fe013be4SDimitry Andric assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2570fe013be4SDimitry Andric assert(InterleaveFactor == 2 &&
2571fe013be4SDimitry Andric "Unsupported deinterleave factor for scalable vectors");
2572fe013be4SDimitry Andric auto *BlockInMaskPart = State.get(BlockInMask, Part);
2573fe013be4SDimitry Andric SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2574fe013be4SDimitry Andric auto *MaskTy =
2575fe013be4SDimitry Andric VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2576fe013be4SDimitry Andric return Builder.CreateIntrinsic(
2577fe013be4SDimitry Andric MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2578fe013be4SDimitry Andric /*FMFSource=*/nullptr, "interleaved.mask");
25790b57cec5SDimitry Andric }
25800b57cec5SDimitry Andric
2581fe013be4SDimitry Andric if (!BlockInMask)
2582fe013be4SDimitry Andric return MaskForGaps;
2583fe013be4SDimitry Andric
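// Replicate each lane of the block mask InterleaveFactor times so the mask
// applies to every member of the group. For example (illustrative values):
// with VF = 4 and a factor of 3, <m0,m1,m2,m3> becomes
// <m0,m0,m0, m1,m1,m1, m2,m2,m2, m3,m3,m3>.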
2584fe013be4SDimitry Andric Value *BlockInMaskPart = State.get(BlockInMask, Part);
2585fe013be4SDimitry Andric Value *ShuffledMask = Builder.CreateShuffleVector(
2586fe013be4SDimitry Andric BlockInMaskPart,
2587fe013be4SDimitry Andric createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2588fe013be4SDimitry Andric "interleaved.mask");
2589fe013be4SDimitry Andric return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2590fe013be4SDimitry Andric MaskForGaps)
2591fe013be4SDimitry Andric : ShuffledMask;
2592fe013be4SDimitry Andric };
2593fe013be4SDimitry Andric
25940b57cec5SDimitry Andric // Vectorize the interleaved load group.
25950b57cec5SDimitry Andric if (isa<LoadInst>(Instr)) {
2596fe013be4SDimitry Andric Value *MaskForGaps = nullptr;
2597fe013be4SDimitry Andric if (NeedsMaskForGaps) {
2598fe013be4SDimitry Andric MaskForGaps =
2599fe013be4SDimitry Andric createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2600fe013be4SDimitry Andric assert(MaskForGaps && "Mask for Gaps is required but it is null");
2601fe013be4SDimitry Andric }
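// For example (illustrative values): a factor-3 group with VF = 4 whose
// member at index 2 is missing gets the gap mask
// <1,1,0, 1,1,0, 1,1,0, 1,1,0>, disabling the lanes that would read the
// absent member.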
2602fe013be4SDimitry Andric
26030b57cec5SDimitry Andric // For each unroll part, create a wide load for the group.
26040b57cec5SDimitry Andric SmallVector<Value *, 2> NewLoads;
26050b57cec5SDimitry Andric for (unsigned Part = 0; Part < UF; Part++) {
26060b57cec5SDimitry Andric Instruction *NewLoad;
2607480093f4SDimitry Andric if (BlockInMask || MaskForGaps) {
26080b57cec5SDimitry Andric assert(useMaskedInterleavedAccesses(*TTI) &&
26090b57cec5SDimitry Andric "masked interleaved groups are not allowed.");
2610fe013be4SDimitry Andric Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
26110b57cec5SDimitry Andric NewLoad =
2612fe6060f1SDimitry Andric Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2613e8d8bef9SDimitry Andric GroupMask, PoisonVec, "wide.masked.vec");
26140b57cec5SDimitry Andric }
26150b57cec5SDimitry Andric else
2616480093f4SDimitry Andric NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
26175ffd83dbSDimitry Andric Group->getAlign(), "wide.vec");
26180b57cec5SDimitry Andric Group->addMetadata(NewLoad);
26190b57cec5SDimitry Andric NewLoads.push_back(NewLoad);
26200b57cec5SDimitry Andric }
26210b57cec5SDimitry Andric
2622fe013be4SDimitry Andric if (VecTy->isScalableTy()) {
2623fe013be4SDimitry Andric assert(InterleaveFactor == 2 &&
2624fe013be4SDimitry Andric "Unsupported deinterleave factor for scalable vectors");
2625fe013be4SDimitry Andric
2626fe013be4SDimitry Andric for (unsigned Part = 0; Part < UF; ++Part) {
2627fe013be4SDimitry Andric // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2628fe013be4SDimitry Andric // so must use intrinsics to deinterleave.
2629fe013be4SDimitry Andric Value *DI = Builder.CreateIntrinsic(
2630fe013be4SDimitry Andric Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2631fe013be4SDimitry Andric /*FMFSource=*/nullptr, "strided.vec");
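// The deinterleave2 intrinsic returns a pair of vectors: result 0 holds the
// even lanes (member 0) and result 1 the odd lanes (member 1) of the wide
// load.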
2632fe013be4SDimitry Andric unsigned J = 0;
2633fe013be4SDimitry Andric for (unsigned I = 0; I < InterleaveFactor; ++I) {
2634fe013be4SDimitry Andric Instruction *Member = Group->getMember(I);
2635fe013be4SDimitry Andric
2636fe013be4SDimitry Andric if (!Member)
2637fe013be4SDimitry Andric continue;
2638fe013be4SDimitry Andric
2639fe013be4SDimitry Andric Value *StridedVec = Builder.CreateExtractValue(DI, I);
2640fe013be4SDimitry Andric // If this member has a different type, cast the result type.
2641fe013be4SDimitry Andric if (Member->getType() != ScalarTy) {
2642fe013be4SDimitry Andric VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2643fe013be4SDimitry Andric StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2644fe013be4SDimitry Andric }
2645fe013be4SDimitry Andric
2646fe013be4SDimitry Andric if (Group->isReverse())
2647fe013be4SDimitry Andric StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2648fe013be4SDimitry Andric
2649fe013be4SDimitry Andric State.set(VPDefs[J], StridedVec, Part);
2650fe013be4SDimitry Andric ++J;
2651fe013be4SDimitry Andric }
2652fe013be4SDimitry Andric }
2653fe013be4SDimitry Andric
2654fe013be4SDimitry Andric return;
2655fe013be4SDimitry Andric }
2656fe013be4SDimitry Andric
26570b57cec5SDimitry Andric // For each member in the group, shuffle out the appropriate data from the
26580b57cec5SDimitry Andric // wide loads.
2659e8d8bef9SDimitry Andric unsigned J = 0;
26600b57cec5SDimitry Andric for (unsigned I = 0; I < InterleaveFactor; ++I) {
26610b57cec5SDimitry Andric Instruction *Member = Group->getMember(I);
26620b57cec5SDimitry Andric
26630b57cec5SDimitry Andric // Skip the gaps in the group.
26640b57cec5SDimitry Andric if (!Member)
26650b57cec5SDimitry Andric continue;
26660b57cec5SDimitry Andric
2667e8d8bef9SDimitry Andric auto StrideMask =
2668e8d8bef9SDimitry Andric createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
26690b57cec5SDimitry Andric for (unsigned Part = 0; Part < UF; Part++) {
26700b57cec5SDimitry Andric Value *StridedVec = Builder.CreateShuffleVector(
2671e8d8bef9SDimitry Andric NewLoads[Part], StrideMask, "strided.vec");
26720b57cec5SDimitry Andric
26730b57cec5SDimitry Andric // If this member has a different type, cast the result type.
26740b57cec5SDimitry Andric if (Member->getType() != ScalarTy) {
2675e8d8bef9SDimitry Andric assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2676e8d8bef9SDimitry Andric VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
26770b57cec5SDimitry Andric StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
26780b57cec5SDimitry Andric }
26790b57cec5SDimitry Andric
26800b57cec5SDimitry Andric if (Group->isReverse())
268104eeddc0SDimitry Andric StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
26820b57cec5SDimitry Andric
2683fe6060f1SDimitry Andric State.set(VPDefs[J], StridedVec, Part);
26840b57cec5SDimitry Andric }
2685e8d8bef9SDimitry Andric ++J;
26860b57cec5SDimitry Andric }
26870b57cec5SDimitry Andric return;
26880b57cec5SDimitry Andric }
26890b57cec5SDimitry Andric
26900b57cec5SDimitry Andric // The sub vector type for the current instruction.
2691e8d8bef9SDimitry Andric auto *SubVT = VectorType::get(ScalarTy, VF);
26920b57cec5SDimitry Andric
26930b57cec5SDimitry Andric // Vectorize the interleaved store group.
2694fe013be4SDimitry Andric Value *MaskForGaps =
2695fe013be4SDimitry Andric createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2696349cc55cSDimitry Andric assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2697349cc55cSDimitry Andric "masked interleaved groups are not allowed.");
2698349cc55cSDimitry Andric assert((!MaskForGaps || !VF.isScalable()) &&
2699349cc55cSDimitry Andric "masking gaps for scalable vectors is not yet supported.");
27000b57cec5SDimitry Andric for (unsigned Part = 0; Part < UF; Part++) {
27010b57cec5SDimitry Andric // Collect the stored vector from each member.
27020b57cec5SDimitry Andric SmallVector<Value *, 4> StoredVecs;
2703bdd1243dSDimitry Andric unsigned StoredIdx = 0;
27040b57cec5SDimitry Andric for (unsigned i = 0; i < InterleaveFactor; i++) {
2705349cc55cSDimitry Andric assert((Group->getMember(i) || MaskForGaps) &&
2706349cc55cSDimitry Andric "Fail to get a member from an interleaved store group");
2707349cc55cSDimitry Andric Instruction *Member = Group->getMember(i);
2708349cc55cSDimitry Andric
2709349cc55cSDimitry Andric // Skip the gaps in the group.
2710349cc55cSDimitry Andric if (!Member) {
2711349cc55cSDimitry Andric Value *Poison = PoisonValue::get(SubVT);
2712349cc55cSDimitry Andric StoredVecs.push_back(Poison);
2713349cc55cSDimitry Andric continue;
2714349cc55cSDimitry Andric }
27150b57cec5SDimitry Andric
2716bdd1243dSDimitry Andric Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2717bdd1243dSDimitry Andric ++StoredIdx;
2718e8d8bef9SDimitry Andric
27190b57cec5SDimitry Andric if (Group->isReverse())
272004eeddc0SDimitry Andric StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
27210b57cec5SDimitry Andric
27220b57cec5SDimitry Andric // If this member has a different type, cast it to a unified type.
27230b57cec5SDimitry Andric
27240b57cec5SDimitry Andric if (StoredVec->getType() != SubVT)
27250b57cec5SDimitry Andric StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
27260b57cec5SDimitry Andric
27270b57cec5SDimitry Andric StoredVecs.push_back(StoredVec);
27280b57cec5SDimitry Andric }
27290b57cec5SDimitry Andric
2730fe013be4SDimitry Andric // Interleave all the smaller vectors into one wider vector.
2731fe013be4SDimitry Andric Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
27320b57cec5SDimitry Andric Instruction *NewStoreInstr;
2733349cc55cSDimitry Andric if (BlockInMask || MaskForGaps) {
2734fe013be4SDimitry Andric Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2735349cc55cSDimitry Andric NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2736349cc55cSDimitry Andric Group->getAlign(), GroupMask);
2737349cc55cSDimitry Andric } else
27385ffd83dbSDimitry Andric NewStoreInstr =
27395ffd83dbSDimitry Andric Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
27400b57cec5SDimitry Andric
27410b57cec5SDimitry Andric Group->addMetadata(NewStoreInstr);
27420b57cec5SDimitry Andric }
27430b57cec5SDimitry Andric }
27440b57cec5SDimitry Andric
2745bdd1243dSDimitry Andric void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
27464824e7fdSDimitry Andric VPReplicateRecipe *RepRecipe,
27470b57cec5SDimitry Andric const VPIteration &Instance,
27485ffd83dbSDimitry Andric VPTransformState &State) {
27490b57cec5SDimitry Andric assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
27500b57cec5SDimitry Andric
2751e8d8bef9SDimitry Andric // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2752e8d8bef9SDimitry Andric // the first lane and part.
2753e8d8bef9SDimitry Andric if (isa<NoAliasScopeDeclInst>(Instr))
2754fe6060f1SDimitry Andric if (!Instance.isFirstIteration())
2755e8d8bef9SDimitry Andric return;
2756e8d8bef9SDimitry Andric
27570b57cec5SDimitry Andric // Does this instruction return a value ?
27580b57cec5SDimitry Andric bool IsVoidRetTy = Instr->getType()->isVoidTy();
27590b57cec5SDimitry Andric
27600b57cec5SDimitry Andric Instruction *Cloned = Instr->clone();
2761c9157d92SDimitry Andric if (!IsVoidRetTy) {
27620b57cec5SDimitry Andric Cloned->setName(Instr->getName() + ".cloned");
2763c9157d92SDimitry Andric #if !defined(NDEBUG)
2764c9157d92SDimitry Andric // Verify that VPlan type inference results agree with the type of the
2765c9157d92SDimitry Andric // generated values.
2766c9157d92SDimitry Andric assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2767c9157d92SDimitry Andric "inferred type and type from generated instructions do not match");
2768c9157d92SDimitry Andric #endif
2769c9157d92SDimitry Andric }
27700b57cec5SDimitry Andric
2771fe013be4SDimitry Andric RepRecipe->setFlags(Cloned);
27724824e7fdSDimitry Andric
2773c9157d92SDimitry Andric if (auto DL = Instr->getDebugLoc())
2774c9157d92SDimitry Andric State.setDebugLocFrom(DL);
277581ad6265SDimitry Andric
27760b57cec5SDimitry Andric // Replace the operands of the cloned instructions with their scalar
27770b57cec5SDimitry Andric // equivalents in the new loop.
2778bdd1243dSDimitry Andric for (const auto &I : enumerate(RepRecipe->operands())) {
2779e8d8bef9SDimitry Andric auto InputInstance = Instance;
27800eae32dcSDimitry Andric VPValue *Operand = I.value();
2781bdd1243dSDimitry Andric if (vputils::isUniformAfterVectorization(Operand))
2782fe6060f1SDimitry Andric InputInstance.Lane = VPLane::getFirstLane();
27830eae32dcSDimitry Andric Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
27840b57cec5SDimitry Andric }
278581ad6265SDimitry Andric State.addNewMetadata(Cloned, Instr);
27860b57cec5SDimitry Andric
27870b57cec5SDimitry Andric // Place the cloned scalar in the new loop.
278881ad6265SDimitry Andric State.Builder.Insert(Cloned);
27890b57cec5SDimitry Andric
27904824e7fdSDimitry Andric State.set(RepRecipe, Cloned, Instance);
27910b57cec5SDimitry Andric
27920b57cec5SDimitry Andric // If we just cloned a new assumption, add it the assumption cache.
2793fe6060f1SDimitry Andric if (auto *II = dyn_cast<AssumeInst>(Cloned))
27940b57cec5SDimitry Andric AC->registerAssumption(II);
27950b57cec5SDimitry Andric
27960b57cec5SDimitry Andric // End if-block.
2797fe013be4SDimitry Andric bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
27980b57cec5SDimitry Andric if (IfPredicateInstr)
27990b57cec5SDimitry Andric PredicatedInstructions.push_back(Cloned);
28000b57cec5SDimitry Andric }
28010b57cec5SDimitry Andric
280281ad6265SDimitry Andric Value *
280381ad6265SDimitry Andric InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
28040b57cec5SDimitry Andric if (VectorTripCount)
28050b57cec5SDimitry Andric return VectorTripCount;
28060b57cec5SDimitry Andric
2807fe013be4SDimitry Andric Value *TC = getTripCount();
280881ad6265SDimitry Andric IRBuilder<> Builder(InsertBlock->getTerminator());
28090b57cec5SDimitry Andric
28100b57cec5SDimitry Andric Type *Ty = TC->getType();
2811e8d8bef9SDimitry Andric // This is where we can make the step a runtime constant.
2812349cc55cSDimitry Andric Value *Step = createStepForVF(Builder, Ty, VF, UF);
28130b57cec5SDimitry Andric
28140b57cec5SDimitry Andric // If the tail is to be folded by masking, round the number of iterations N
28150b57cec5SDimitry Andric // up to a multiple of Step instead of rounding down. This is done by first
28160b57cec5SDimitry Andric // adding Step-1 and then rounding down. Note that it's ok if this addition
28170b57cec5SDimitry Andric // overflows: the vector induction variable will eventually wrap to zero given
28180b57cec5SDimitry Andric // that it starts at zero and its Step is a power of two; the loop will then
28190b57cec5SDimitry Andric // exit, with the last early-exit vector comparison also producing all-true.
282081ad6265SDimitry Andric // For scalable vectors the VF is not guaranteed to be a power of 2, but this
282181ad6265SDimitry Andric // is accounted for in emitIterationCountCheck that adds an overflow check.
28220b57cec5SDimitry Andric if (Cost->foldTailByMasking()) {
2823e8d8bef9SDimitry Andric assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
28240b57cec5SDimitry Andric "VF*UF must be a power of 2 when folding tail by masking");
282504eeddc0SDimitry Andric Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2826e8d8bef9SDimitry Andric TC = Builder.CreateAdd(
282704eeddc0SDimitry Andric TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
28280b57cec5SDimitry Andric }
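// For example (illustrative values): with TC = 10 and VF * UF = 8, TC is
// rounded up to 17 here, so the computation below yields n.vec = 16 and the
// masked vector loop covers all 10 original iterations.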
28290b57cec5SDimitry Andric
28300b57cec5SDimitry Andric // Now we need to generate the expression for the part of the loop that the
28310b57cec5SDimitry Andric // vectorized body will execute. This is equal to N - (N % Step) if scalar
28320b57cec5SDimitry Andric // iterations are not required for correctness, or N - Step, otherwise. Step
28330b57cec5SDimitry Andric // is equal to the vectorization factor (number of SIMD elements) times the
28340b57cec5SDimitry Andric // unroll factor (number of SIMD instructions).
28350b57cec5SDimitry Andric Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
28360b57cec5SDimitry Andric
2837fe6060f1SDimitry Andric // There are cases where we *must* run at least one iteration in the remainder
2838fe6060f1SDimitry Andric // loop. See the cost model for when this can happen. If the step evenly
2839fe6060f1SDimitry Andric // divides the trip count, we set the remainder to be equal to the step. If
2840fe6060f1SDimitry Andric // the step does not evenly divide the trip count, no adjustment is necessary
2841fe6060f1SDimitry Andric // since there will already be scalar iterations. Note that the minimum
2842fe6060f1SDimitry Andric // iterations check ensures that N >= Step.
2843fe013be4SDimitry Andric if (Cost->requiresScalarEpilogue(VF.isVector())) {
28440b57cec5SDimitry Andric auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
28450b57cec5SDimitry Andric R = Builder.CreateSelect(IsZero, Step, R);
28460b57cec5SDimitry Andric }
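// For example (illustrative values): if a scalar epilogue is required and
// TC = 16 with Step = 8, n.mod.vf is 0, so R is bumped to Step and n.vec = 8,
// guaranteeing that the epilogue still executes the final 8 iterations.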
28470b57cec5SDimitry Andric
28480b57cec5SDimitry Andric VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
28490b57cec5SDimitry Andric
28500b57cec5SDimitry Andric return VectorTripCount;
28510b57cec5SDimitry Andric }
28520b57cec5SDimitry Andric
28530b57cec5SDimitry Andric Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
28540b57cec5SDimitry Andric const DataLayout &DL) {
28550b57cec5SDimitry Andric // Verify that V is a vector type with same number of elements as DstVTy.
2856fe013be4SDimitry Andric auto *DstFVTy = cast<VectorType>(DstVTy);
2857fe013be4SDimitry Andric auto VF = DstFVTy->getElementCount();
2858fe013be4SDimitry Andric auto *SrcVecTy = cast<VectorType>(V->getType());
2859fe013be4SDimitry Andric assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
28600b57cec5SDimitry Andric Type *SrcElemTy = SrcVecTy->getElementType();
2861e8d8bef9SDimitry Andric Type *DstElemTy = DstFVTy->getElementType();
28620b57cec5SDimitry Andric assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
28630b57cec5SDimitry Andric "Vector elements must have same size");
28640b57cec5SDimitry Andric
28650b57cec5SDimitry Andric // Do a direct cast if element types are castable.
28660b57cec5SDimitry Andric if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2867e8d8bef9SDimitry Andric return Builder.CreateBitOrPointerCast(V, DstFVTy);
28680b57cec5SDimitry Andric }
28690b57cec5SDimitry Andric // V cannot be directly casted to desired vector type.
28700b57cec5SDimitry Andric // May happen when V is a floating point vector but DstVTy is a vector of
28710b57cec5SDimitry Andric // pointers or vice-versa. Handle this using a two-step bitcast using an
28720b57cec5SDimitry Andric // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
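// For example (illustrative, assuming 64-bit pointers): casting
// <2 x double> to <2 x ptr> goes via an intermediate <2 x i64>.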
28730b57cec5SDimitry Andric assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
28740b57cec5SDimitry Andric "Only one type should be a pointer type");
28750b57cec5SDimitry Andric assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
28760b57cec5SDimitry Andric "Only one type should be a floating point type");
28770b57cec5SDimitry Andric Type *IntTy =
28780b57cec5SDimitry Andric IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2879fe013be4SDimitry Andric auto *VecIntTy = VectorType::get(IntTy, VF);
28800b57cec5SDimitry Andric Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2881e8d8bef9SDimitry Andric return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
28820b57cec5SDimitry Andric }
28830b57cec5SDimitry Andric
288481ad6265SDimitry Andric void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2885fe013be4SDimitry Andric Value *Count = getTripCount();
2886480093f4SDimitry Andric // Reuse the existing vector loop preheader for the TC checks.
2887480093f4SDimitry Andric // Note that a new preheader block is generated for the vector loop.
2888480093f4SDimitry Andric BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2889480093f4SDimitry Andric IRBuilder<> Builder(TCCheckBlock->getTerminator());
28900b57cec5SDimitry Andric
28910b57cec5SDimitry Andric // Generate code to check if the loop's trip count is less than VF * UF, or
28920b57cec5SDimitry Andric // equal to it in case a scalar epilogue is required; this implies that the
28930b57cec5SDimitry Andric // vector trip count is zero. This check also covers the case where adding one
28940b57cec5SDimitry Andric // to the backedge-taken count overflowed leading to an incorrect trip count
28950b57cec5SDimitry Andric // of zero. In this case we will also jump to the scalar loop.
2896fe013be4SDimitry Andric auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
28970b57cec5SDimitry Andric : ICmpInst::ICMP_ULT;
28980b57cec5SDimitry Andric
28990b57cec5SDimitry Andric // If tail is to be folded, vector loop takes care of all iterations.
290081ad6265SDimitry Andric Type *CountTy = Count->getType();
29010b57cec5SDimitry Andric Value *CheckMinIters = Builder.getFalse();
2902fcaf7f86SDimitry Andric auto CreateStep = [&]() -> Value * {
2903753f127fSDimitry Andric // Create step with max(MinProTripCount, UF * VF).
2904fcaf7f86SDimitry Andric if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2905753f127fSDimitry Andric return createStepForVF(Builder, CountTy, VF, UF);
2906fcaf7f86SDimitry Andric
2907fcaf7f86SDimitry Andric Value *MinProfTC =
2908fcaf7f86SDimitry Andric createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2909fcaf7f86SDimitry Andric if (!VF.isScalable())
2910fcaf7f86SDimitry Andric return MinProfTC;
2911fcaf7f86SDimitry Andric return Builder.CreateBinaryIntrinsic(
2912fcaf7f86SDimitry Andric Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2913753f127fSDimitry Andric };
2914753f127fSDimitry Andric
2915fe013be4SDimitry Andric TailFoldingStyle Style = Cost->getTailFoldingStyle();
2916fe013be4SDimitry Andric if (Style == TailFoldingStyle::None)
2917753f127fSDimitry Andric CheckMinIters =
2918753f127fSDimitry Andric Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2919fe013be4SDimitry Andric else if (VF.isScalable() &&
2920fe013be4SDimitry Andric !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2921fe013be4SDimitry Andric Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
292281ad6265SDimitry Andric // vscale is not necessarily a power-of-2, which means we cannot guarantee
292381ad6265SDimitry Andric // an overflow to zero when updating induction variables and so an
292481ad6265SDimitry Andric // additional overflow check is required before entering the vector loop.
292581ad6265SDimitry Andric
292681ad6265SDimitry Andric // Get the maximum unsigned value for the type.
292781ad6265SDimitry Andric Value *MaxUIntTripCount =
292881ad6265SDimitry Andric ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
292981ad6265SDimitry Andric Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
293081ad6265SDimitry Andric
293181ad6265SDimitry Andric // Don't execute the vector loop if (UMax - n) < (VF * UF).
2932753f127fSDimitry Andric CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2933e8d8bef9SDimitry Andric }
2934753f127fSDimitry Andric
2935480093f4SDimitry Andric // Create new preheader for vector loop.
2936480093f4SDimitry Andric LoopVectorPreHeader =
2937480093f4SDimitry Andric SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2938480093f4SDimitry Andric "vector.ph");
2939480093f4SDimitry Andric
2940480093f4SDimitry Andric assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2941480093f4SDimitry Andric DT->getNode(Bypass)->getIDom()) &&
2942480093f4SDimitry Andric "TC check is expected to dominate Bypass");
2943480093f4SDimitry Andric
2944fe6060f1SDimitry Andric // Update dominator for Bypass & LoopExit (if needed).
2945480093f4SDimitry Andric DT->changeImmediateDominator(Bypass, TCCheckBlock);
2946fe013be4SDimitry Andric if (!Cost->requiresScalarEpilogue(VF.isVector()))
2947fe6060f1SDimitry Andric // If there is an epilogue which must run, there's no edge from the
2948fe6060f1SDimitry Andric // middle block to exit blocks and thus no need to update the immediate
2949fe6060f1SDimitry Andric // dominator of the exit blocks.
2950480093f4SDimitry Andric DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2951480093f4SDimitry Andric
2952c9157d92SDimitry Andric BranchInst &BI =
2953c9157d92SDimitry Andric *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2954c9157d92SDimitry Andric if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2955c9157d92SDimitry Andric setBranchWeights(BI, MinItersBypassWeights);
2956c9157d92SDimitry Andric ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2957480093f4SDimitry Andric LoopBypassBlocks.push_back(TCCheckBlock);
29580b57cec5SDimitry Andric }
29590b57cec5SDimitry Andric
296081ad6265SDimitry Andric BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2961fe6060f1SDimitry Andric BasicBlock *const SCEVCheckBlock =
296281ad6265SDimitry Andric RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2963fe6060f1SDimitry Andric if (!SCEVCheckBlock)
2964fe6060f1SDimitry Andric return nullptr;
29650b57cec5SDimitry Andric
2966e8d8bef9SDimitry Andric assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2967e8d8bef9SDimitry Andric (OptForSizeBasedOnProfile &&
2968e8d8bef9SDimitry Andric Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
29698bcb0991SDimitry Andric "Cannot SCEV check stride or overflow when optimizing for size");
29708bcb0991SDimitry Andric
2971480093f4SDimitry Andric
2972480093f4SDimitry Andric // Update dominator only if this is first RT check.
2973480093f4SDimitry Andric if (LoopBypassBlocks.empty()) {
2974480093f4SDimitry Andric DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2975fe013be4SDimitry Andric if (!Cost->requiresScalarEpilogue(VF.isVector()))
2976fe6060f1SDimitry Andric // If there is an epilogue which must run, there's no edge from the
2977fe6060f1SDimitry Andric // middle block to exit blocks and thus no need to update the immediate
2978fe6060f1SDimitry Andric // dominator of the exit blocks.
2979480093f4SDimitry Andric DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2980480093f4SDimitry Andric }
2981480093f4SDimitry Andric
2982480093f4SDimitry Andric LoopBypassBlocks.push_back(SCEVCheckBlock);
29830b57cec5SDimitry Andric AddedSafetyChecks = true;
2984fe6060f1SDimitry Andric return SCEVCheckBlock;
29850b57cec5SDimitry Andric }
29860b57cec5SDimitry Andric
298781ad6265SDimitry Andric BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
29880b57cec5SDimitry Andric // VPlan-native path does not do any analysis for runtime checks currently.
29890b57cec5SDimitry Andric if (EnableVPlanNativePath)
2990fe6060f1SDimitry Andric return nullptr;
29910b57cec5SDimitry Andric
2992fe6060f1SDimitry Andric BasicBlock *const MemCheckBlock =
299381ad6265SDimitry Andric RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
29940b57cec5SDimitry Andric
2995fe6060f1SDimitry Andric // Check if we generated code that checks at runtime whether arrays overlap. We put
2996fe6060f1SDimitry Andric // the checks into a separate block to make the more common case of few
2997fe6060f1SDimitry Andric // elements faster.
2998fe6060f1SDimitry Andric if (!MemCheckBlock)
2999fe6060f1SDimitry Andric return nullptr;
30000b57cec5SDimitry Andric
3001e8d8bef9SDimitry Andric if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
30028bcb0991SDimitry Andric assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
30038bcb0991SDimitry Andric "Cannot emit memory checks when optimizing for size, unless forced "
30048bcb0991SDimitry Andric "to vectorize.");
30058bcb0991SDimitry Andric ORE->emit([&]() {
30068bcb0991SDimitry Andric return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
300781ad6265SDimitry Andric OrigLoop->getStartLoc(),
300881ad6265SDimitry Andric OrigLoop->getHeader())
30098bcb0991SDimitry Andric << "Code-size may be reduced by not forcing "
30108bcb0991SDimitry Andric "vectorization, or by source-code modifications "
30118bcb0991SDimitry Andric "eliminating the need for runtime checks "
30128bcb0991SDimitry Andric "(e.g., adding 'restrict').";
30138bcb0991SDimitry Andric });
30148bcb0991SDimitry Andric }
30158bcb0991SDimitry Andric
3016e8d8bef9SDimitry Andric LoopBypassBlocks.push_back(MemCheckBlock);
3017fe6060f1SDimitry Andric
3018e8d8bef9SDimitry Andric AddedSafetyChecks = true;
3019e8d8bef9SDimitry Andric
3020fe6060f1SDimitry Andric return MemCheckBlock;
30210b57cec5SDimitry Andric }
30220b57cec5SDimitry Andric
302381ad6265SDimitry Andric void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3024e8d8bef9SDimitry Andric LoopScalarBody = OrigLoop->getHeader();
3025e8d8bef9SDimitry Andric LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3026e8d8bef9SDimitry Andric assert(LoopVectorPreHeader && "Invalid loop structure");
3027fe6060f1SDimitry Andric LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3028fe013be4SDimitry Andric assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
3029fe6060f1SDimitry Andric "multiple exit loop without required epilogue?");
3030e8d8bef9SDimitry Andric
3031e8d8bef9SDimitry Andric LoopMiddleBlock =
3032e8d8bef9SDimitry Andric SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3033e8d8bef9SDimitry Andric LI, nullptr, Twine(Prefix) + "middle.block");
3034e8d8bef9SDimitry Andric LoopScalarPreHeader =
3035e8d8bef9SDimitry Andric SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3036e8d8bef9SDimitry Andric nullptr, Twine(Prefix) + "scalar.ph");
3037e8d8bef9SDimitry Andric
3038e8d8bef9SDimitry Andric auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3039fe6060f1SDimitry Andric
3040fe6060f1SDimitry Andric // Set up the middle block terminator. Two cases:
3041fe6060f1SDimitry Andric // 1) If we know that we must execute the scalar epilogue, emit an
3042fe6060f1SDimitry Andric // unconditional branch.
3043fe6060f1SDimitry Andric // 2) Otherwise, we must have a single unique exit block (due to how we
3044bdd1243dSDimitry Andric // implement the multiple exit case). In this case, set up a conditional
3045fe6060f1SDimitry Andric // branch from the middle block to the loop scalar preheader, and the
3046fe6060f1SDimitry Andric // exit block. completeLoopSkeleton will update the condition to use an
3047fe6060f1SDimitry Andric // iteration check, if required to decide whether to execute the remainder.
3048fe013be4SDimitry Andric BranchInst *BrInst =
3049fe013be4SDimitry Andric Cost->requiresScalarEpilogue(VF.isVector())
3050fe013be4SDimitry Andric ? BranchInst::Create(LoopScalarPreHeader)
3051fe013be4SDimitry Andric : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3052fe6060f1SDimitry Andric Builder.getTrue());
3053e8d8bef9SDimitry Andric BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3054e8d8bef9SDimitry Andric ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3055e8d8bef9SDimitry Andric
305681ad6265SDimitry Andric // Update dominator for loop exit. During skeleton creation, only the vector
305781ad6265SDimitry Andric // pre-header and the middle block are created. The vector loop is entirely
305881ad6265SDimitry Andric // created during VPlan execution.
3059fe013be4SDimitry Andric if (!Cost->requiresScalarEpilogue(VF.isVector()))
3060fe6060f1SDimitry Andric // If there is an epilogue which must run, there's no edge from the
3061fe6060f1SDimitry Andric // middle block to exit blocks and thus no need to update the immediate
3062fe6060f1SDimitry Andric // dominator of the exit blocks.
3063e8d8bef9SDimitry Andric DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3064e8d8bef9SDimitry Andric }
3065e8d8bef9SDimitry Andric
3066bdd1243dSDimitry Andric PHINode *InnerLoopVectorizer::createInductionResumeValue(
3067fe013be4SDimitry Andric PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3068bdd1243dSDimitry Andric ArrayRef<BasicBlock *> BypassBlocks,
306981ad6265SDimitry Andric std::pair<BasicBlock *, Value *> AdditionalBypass) {
307081ad6265SDimitry Andric Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
307181ad6265SDimitry Andric assert(VectorTripCount && "Expected valid arguments");
3072e8d8bef9SDimitry Andric
3073bdd1243dSDimitry Andric Instruction *OldInduction = Legal->getPrimaryInduction();
3074e8d8bef9SDimitry Andric Value *&EndValue = IVEndValues[OrigPhi];
3075e8d8bef9SDimitry Andric Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3076e8d8bef9SDimitry Andric if (OrigPhi == OldInduction) {
3077e8d8bef9SDimitry Andric // We know what the end value is.
3078e8d8bef9SDimitry Andric EndValue = VectorTripCount;
3079e8d8bef9SDimitry Andric } else {
308081ad6265SDimitry Andric IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3081fe6060f1SDimitry Andric
3082fe6060f1SDimitry Andric // Fast-math-flags propagate from the original induction instruction.
3083fe6060f1SDimitry Andric if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3084fe6060f1SDimitry Andric B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3085fe6060f1SDimitry Andric
3086c9157d92SDimitry Andric EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3087c9157d92SDimitry Andric Step, II.getKind(), II.getInductionBinOp());
3088e8d8bef9SDimitry Andric EndValue->setName("ind.end");
3089e8d8bef9SDimitry Andric
3090e8d8bef9SDimitry Andric // Compute the end value for the additional bypass (if applicable).
3091e8d8bef9SDimitry Andric if (AdditionalBypass.first) {
3092c9157d92SDimitry Andric B.SetInsertPoint(AdditionalBypass.first,
3093c9157d92SDimitry Andric AdditionalBypass.first->getFirstInsertionPt());
3094c9157d92SDimitry Andric EndValueFromAdditionalBypass =
3095c9157d92SDimitry Andric emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3096c9157d92SDimitry Andric Step, II.getKind(), II.getInductionBinOp());
3097e8d8bef9SDimitry Andric EndValueFromAdditionalBypass->setName("ind.end");
3098e8d8bef9SDimitry Andric }
3099e8d8bef9SDimitry Andric }
310081ad6265SDimitry Andric
310181ad6265SDimitry Andric // Create phi nodes to merge from the backedge-taken check block.
3102bdd1243dSDimitry Andric PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
310381ad6265SDimitry Andric LoopScalarPreHeader->getTerminator());
310481ad6265SDimitry Andric // Copy original phi DL over to the new one.
310581ad6265SDimitry Andric BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
310681ad6265SDimitry Andric
3107e8d8bef9SDimitry Andric // The new PHI merges the original incoming value, in case of a bypass,
3108e8d8bef9SDimitry Andric // or the value at the end of the vectorized loop.
3109e8d8bef9SDimitry Andric BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3110e8d8bef9SDimitry Andric
3111e8d8bef9SDimitry Andric // Fix the scalar body counter (PHI node).
3112e8d8bef9SDimitry Andric // The old induction's phi node in the scalar body needs the truncated
3113e8d8bef9SDimitry Andric // value.
3114bdd1243dSDimitry Andric for (BasicBlock *BB : BypassBlocks)
3115e8d8bef9SDimitry Andric BCResumeVal->addIncoming(II.getStartValue(), BB);
3116e8d8bef9SDimitry Andric
3117e8d8bef9SDimitry Andric if (AdditionalBypass.first)
3118e8d8bef9SDimitry Andric BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3119e8d8bef9SDimitry Andric EndValueFromAdditionalBypass);
3120bdd1243dSDimitry Andric return BCResumeVal;
3121bdd1243dSDimitry Andric }
3122e8d8bef9SDimitry Andric
3123fe013be4SDimitry Andric /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3124fe013be4SDimitry Andric /// expansion results.
3125fe013be4SDimitry Andric static Value *getExpandedStep(const InductionDescriptor &ID,
3126fe013be4SDimitry Andric const SCEV2ValueTy &ExpandedSCEVs) {
3127fe013be4SDimitry Andric const SCEV *Step = ID.getStep();
3128fe013be4SDimitry Andric if (auto *C = dyn_cast<SCEVConstant>(Step))
3129fe013be4SDimitry Andric return C->getValue();
3130fe013be4SDimitry Andric if (auto *U = dyn_cast<SCEVUnknown>(Step))
3131fe013be4SDimitry Andric return U->getValue();
3132fe013be4SDimitry Andric auto I = ExpandedSCEVs.find(Step);
3133fe013be4SDimitry Andric assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3134fe013be4SDimitry Andric return I->second;
3135fe013be4SDimitry Andric }
3136fe013be4SDimitry Andric
3137bdd1243dSDimitry Andric void InnerLoopVectorizer::createInductionResumeValues(
3138fe013be4SDimitry Andric const SCEV2ValueTy &ExpandedSCEVs,
3139bdd1243dSDimitry Andric std::pair<BasicBlock *, Value *> AdditionalBypass) {
3140bdd1243dSDimitry Andric assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3141bdd1243dSDimitry Andric (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3142bdd1243dSDimitry Andric "Inconsistent information about additional bypass.");
3143bdd1243dSDimitry Andric // We are going to resume the execution of the scalar loop.
3144bdd1243dSDimitry Andric // Go over all of the induction variables that we found and fix the
3145bdd1243dSDimitry Andric // PHIs that are left in the scalar version of the loop.
3146bdd1243dSDimitry Andric // The starting values of PHI nodes depend on the counter of the last
3147bdd1243dSDimitry Andric // iteration in the vectorized loop.
3148bdd1243dSDimitry Andric // If we come from a bypass edge then we need to start from the original
3149bdd1243dSDimitry Andric // start value.
3150bdd1243dSDimitry Andric for (const auto &InductionEntry : Legal->getInductionVars()) {
3151bdd1243dSDimitry Andric PHINode *OrigPhi = InductionEntry.first;
3152bdd1243dSDimitry Andric const InductionDescriptor &II = InductionEntry.second;
3153bdd1243dSDimitry Andric PHINode *BCResumeVal = createInductionResumeValue(
3154fe013be4SDimitry Andric OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3155fe013be4SDimitry Andric AdditionalBypass);
3156e8d8bef9SDimitry Andric OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3157e8d8bef9SDimitry Andric }
3158e8d8bef9SDimitry Andric }
3159e8d8bef9SDimitry Andric
3160bdd1243dSDimitry Andric BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3161e8d8bef9SDimitry Andric // The trip counts should be cached by now.
3162fe013be4SDimitry Andric Value *Count = getTripCount();
316381ad6265SDimitry Andric Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3164e8d8bef9SDimitry Andric
3165e8d8bef9SDimitry Andric auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3166e8d8bef9SDimitry Andric
3167e8d8bef9SDimitry Andric // Add a check in the middle block to see if we have completed
3168fe6060f1SDimitry Andric // all of the iterations in the first vector loop. Three cases:
3169fe6060f1SDimitry Andric // 1) If we require a scalar epilogue, there is no conditional branch as
3170fe6060f1SDimitry Andric // we unconditionally branch to the scalar preheader. Do nothing.
3171fe6060f1SDimitry Andric // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3172fe6060f1SDimitry Andric // Thus if tail is to be folded, we know we don't need to run the
3173fe6060f1SDimitry Andric // remainder and we can use the previous value for the condition (true).
3174fe6060f1SDimitry Andric // 3) Otherwise, construct a runtime check.
3175fe013be4SDimitry Andric if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3176fe013be4SDimitry Andric !Cost->foldTailByMasking()) {
3177e8d8bef9SDimitry Andric // Here we use the same DebugLoc as the scalar loop latch terminator instead
3178e8d8bef9SDimitry Andric // of the corresponding compare because they may have ended up with
3179e8d8bef9SDimitry Andric // different line numbers and we want to avoid awkward line stepping while
3180e8d8bef9SDimitry Andric // debugging. Eg. if the compare has got a line number inside the loop.
3181c9157d92SDimitry Andric // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3182c9157d92SDimitry Andric // operands. Perform simplification directly on VPlan once the branch is
3183c9157d92SDimitry Andric // modeled there.
3184c9157d92SDimitry Andric IRBuilder<> B(LoopMiddleBlock->getTerminator());
3185c9157d92SDimitry Andric B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3186c9157d92SDimitry Andric Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3187c9157d92SDimitry Andric BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3188c9157d92SDimitry Andric BI.setCondition(CmpN);
3189c9157d92SDimitry Andric if (hasBranchWeightMD(*ScalarLatchTerm)) {
3190c9157d92SDimitry Andric // Assume that `Count % VectorTripCount` is equally distributed.
3191c9157d92SDimitry Andric unsigned TripCount = UF * VF.getKnownMinValue();
3192c9157d92SDimitry Andric assert(TripCount > 0 && "trip count should not be zero");
3193c9157d92SDimitry Andric const uint32_t Weights[] = {1, TripCount - 1};
3194c9157d92SDimitry Andric setBranchWeights(BI, Weights);
3195c9157d92SDimitry Andric }
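// For example (illustrative values): with VF = 4 and UF = 2 the weights are
// {1, 7}, modelling that the remainder is empty (and the exit edge taken)
// roughly one time in eight.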
3196e8d8bef9SDimitry Andric }
3197e8d8bef9SDimitry Andric
3198e8d8bef9SDimitry Andric #ifdef EXPENSIVE_CHECKS
3199e8d8bef9SDimitry Andric assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3200e8d8bef9SDimitry Andric #endif
3201e8d8bef9SDimitry Andric
3202e8d8bef9SDimitry Andric return LoopVectorPreHeader;
3203e8d8bef9SDimitry Andric }
3204e8d8bef9SDimitry Andric
320504eeddc0SDimitry Andric std::pair<BasicBlock *, Value *>
3206fe013be4SDimitry Andric InnerLoopVectorizer::createVectorizedLoopSkeleton(
3207fe013be4SDimitry Andric const SCEV2ValueTy &ExpandedSCEVs) {
32080b57cec5SDimitry Andric /*
32090b57cec5SDimitry Andric In this function we generate a new loop. The new loop will contain
32100b57cec5SDimitry Andric the vectorized instructions while the old loop will continue to run the
32110b57cec5SDimitry Andric scalar remainder.
32120b57cec5SDimitry Andric
3213fe013be4SDimitry Andric [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3214fe013be4SDimitry Andric / | preheader are expanded here. Eventually all required SCEV
3215fe013be4SDimitry Andric / | expansion should happen here.
32160b57cec5SDimitry Andric / v
32170b57cec5SDimitry Andric | [ ] <-- vector loop bypass (may consist of multiple blocks).
32180b57cec5SDimitry Andric | / |
32190b57cec5SDimitry Andric | / v
32200b57cec5SDimitry Andric || [ ] <-- vector pre header.
32210b57cec5SDimitry Andric |/ |
32220b57cec5SDimitry Andric | v
32230b57cec5SDimitry Andric | [ ] \
322481ad6265SDimitry Andric | [ ]_| <-- vector loop (created during VPlan execution).
32250b57cec5SDimitry Andric | |
32260b57cec5SDimitry Andric | v
3227fe6060f1SDimitry Andric \ -[ ] <--- middle-block.
3228fe6060f1SDimitry Andric \/ |
3229fe6060f1SDimitry Andric /\ v
3230fe6060f1SDimitry Andric | ->[ ] <--- new preheader.
32310b57cec5SDimitry Andric | |
3232fe6060f1SDimitry Andric (opt) v <-- edge from middle to exit iff epilogue is not required.
32330b57cec5SDimitry Andric | [ ] \
3234fe6060f1SDimitry Andric | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
32350b57cec5SDimitry Andric \ |
32360b57cec5SDimitry Andric \ v
3237fe6060f1SDimitry Andric >[ ] <-- exit block(s).
32380b57cec5SDimitry Andric ...
32390b57cec5SDimitry Andric */
32400b57cec5SDimitry Andric
3241e8d8bef9SDimitry Andric // Create an empty vector loop, and prepare basic blocks for the runtime
3242e8d8bef9SDimitry Andric // checks.
324381ad6265SDimitry Andric createVectorLoopSkeleton("");
32440b57cec5SDimitry Andric
32450b57cec5SDimitry Andric // Now, compare the new count to zero. If it is zero skip the vector loop and
32460b57cec5SDimitry Andric // jump to the scalar loop. This check also covers the case where the
32470b57cec5SDimitry Andric // backedge-taken count is uint##_max: adding one to it will overflow leading
32480b57cec5SDimitry Andric // to an incorrect trip count of zero. In this (rare) case we will also jump
32490b57cec5SDimitry Andric // to the scalar loop.
325081ad6265SDimitry Andric emitIterationCountCheck(LoopScalarPreHeader);
32510b57cec5SDimitry Andric
32520b57cec5SDimitry Andric // Generate the code to check any assumptions that we've made for SCEV
32530b57cec5SDimitry Andric // expressions.
325481ad6265SDimitry Andric emitSCEVChecks(LoopScalarPreHeader);
32550b57cec5SDimitry Andric
32560b57cec5SDimitry Andric // Generate the code that checks at runtime whether arrays overlap. We put the
32570b57cec5SDimitry Andric // checks into a separate block to make the more common case of few elements
32580b57cec5SDimitry Andric // faster.
325981ad6265SDimitry Andric emitMemRuntimeChecks(LoopScalarPreHeader);
32600b57cec5SDimitry Andric
3261e8d8bef9SDimitry Andric // Emit phis for the new starting index of the scalar loop.
3262fe013be4SDimitry Andric createInductionResumeValues(ExpandedSCEVs);
32630b57cec5SDimitry Andric
3264bdd1243dSDimitry Andric return {completeLoopSkeleton(), nullptr};
32650b57cec5SDimitry Andric }
32660b57cec5SDimitry Andric
32670b57cec5SDimitry Andric // Fix up external users of the induction variable. At this point, we are
32680b57cec5SDimitry Andric // in LCSSA form, with all external PHIs that use the IV having one input value,
32690b57cec5SDimitry Andric // coming from the remainder loop. We need those PHIs to also have a correct
32700b57cec5SDimitry Andric // value for the IV when arriving directly from the middle block.
32710b57cec5SDimitry Andric void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
32720b57cec5SDimitry Andric const InductionDescriptor &II,
327381ad6265SDimitry Andric Value *VectorTripCount, Value *EndValue,
327481ad6265SDimitry Andric BasicBlock *MiddleBlock,
3275fe013be4SDimitry Andric BasicBlock *VectorHeader, VPlan &Plan,
3276fe013be4SDimitry Andric VPTransformState &State) {
32770b57cec5SDimitry Andric // There are two kinds of external IV usages - those that use the value
32780b57cec5SDimitry Andric // computed in the last iteration (the PHI) and those that use the penultimate
32790b57cec5SDimitry Andric // value (the value that feeds into the phi from the loop latch).
32800b57cec5SDimitry Andric // We allow both, but they, obviously, have different values.
32810b57cec5SDimitry Andric
3282e8d8bef9SDimitry Andric assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
32830b57cec5SDimitry Andric
32840b57cec5SDimitry Andric DenseMap<Value *, Value *> MissingVals;
32850b57cec5SDimitry Andric
32860b57cec5SDimitry Andric // An external user of the last iteration's value should see the value that
32870b57cec5SDimitry Andric // the remainder loop uses to initialize its own IV.
32880b57cec5SDimitry Andric Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
32890b57cec5SDimitry Andric for (User *U : PostInc->users()) {
32900b57cec5SDimitry Andric Instruction *UI = cast<Instruction>(U);
32910b57cec5SDimitry Andric if (!OrigLoop->contains(UI)) {
32920b57cec5SDimitry Andric assert(isa<PHINode>(UI) && "Expected LCSSA form");
32930b57cec5SDimitry Andric MissingVals[UI] = EndValue;
32940b57cec5SDimitry Andric }
32950b57cec5SDimitry Andric }
32960b57cec5SDimitry Andric
32970b57cec5SDimitry Andric // An external user of the penultimate value needs to see EndValue - Step.
32980b57cec5SDimitry Andric // The simplest way to get this is to recompute it from the constituent SCEVs,
32990b57cec5SDimitry Andric // that is Start + (Step * (CRD - 1)).
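// For example (illustrative values): for an IV starting at 0 with step 2 and
// a vector trip count of 8, a user of the phi itself sees 2 * (8 - 1) = 14,
// while a user of the post-increment value sees 16.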
33000b57cec5SDimitry Andric for (User *U : OrigPhi->users()) {
33010b57cec5SDimitry Andric auto *UI = cast<Instruction>(U);
33020b57cec5SDimitry Andric if (!OrigLoop->contains(UI)) {
33030b57cec5SDimitry Andric assert(isa<PHINode>(UI) && "Expected LCSSA form");
33040b57cec5SDimitry Andric IRBuilder<> B(MiddleBlock->getTerminator());
3305fe6060f1SDimitry Andric
3306fe6060f1SDimitry Andric // Fast-math-flags propagate from the original induction instruction.
3307fe6060f1SDimitry Andric if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3308fe6060f1SDimitry Andric B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3309fe6060f1SDimitry Andric
33100b57cec5SDimitry Andric Value *CountMinusOne = B.CreateSub(
331181ad6265SDimitry Andric VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3312bdd1243dSDimitry Andric CountMinusOne->setName("cmo");
3313fe013be4SDimitry Andric
3314fe013be4SDimitry Andric VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3315fe013be4SDimitry Andric assert(StepVPV && "step must have been expanded during VPlan execution");
3316fe013be4SDimitry Andric Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3317fe013be4SDimitry Andric : State.get(StepVPV, {0, 0});
33180eae32dcSDimitry Andric Value *Escape =
3319c9157d92SDimitry Andric emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3320c9157d92SDimitry Andric II.getKind(), II.getInductionBinOp());
33210b57cec5SDimitry Andric Escape->setName("ind.escape");
33220b57cec5SDimitry Andric MissingVals[UI] = Escape;
33230b57cec5SDimitry Andric }
33240b57cec5SDimitry Andric }
33250b57cec5SDimitry Andric
33260b57cec5SDimitry Andric for (auto &I : MissingVals) {
33270b57cec5SDimitry Andric PHINode *PHI = cast<PHINode>(I.first);
33280b57cec5SDimitry Andric // One corner case we have to handle is two IVs "chasing" each-other,
33290b57cec5SDimitry Andric // that is %IV2 = phi [...], [ %IV1, %latch ]
33300b57cec5SDimitry Andric // In this case, if IV1 has an external use, we need to avoid adding both
33310b57cec5SDimitry Andric // "last value of IV1" and "penultimate value of IV2". So, verify that we
33320b57cec5SDimitry Andric // don't already have an incoming value for the middle block.
333381ad6265SDimitry Andric if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
33340b57cec5SDimitry Andric PHI->addIncoming(I.second, MiddleBlock);
333581ad6265SDimitry Andric Plan.removeLiveOut(PHI);
333681ad6265SDimitry Andric }
33370b57cec5SDimitry Andric }
33380b57cec5SDimitry Andric }
33390b57cec5SDimitry Andric
33400b57cec5SDimitry Andric namespace {
33410b57cec5SDimitry Andric
33420b57cec5SDimitry Andric struct CSEDenseMapInfo {
33430b57cec5SDimitry Andric static bool canHandle(const Instruction *I) {
33440b57cec5SDimitry Andric return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
33450b57cec5SDimitry Andric isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
33460b57cec5SDimitry Andric }
33470b57cec5SDimitry Andric
33480b57cec5SDimitry Andric static inline Instruction *getEmptyKey() {
33490b57cec5SDimitry Andric return DenseMapInfo<Instruction *>::getEmptyKey();
33500b57cec5SDimitry Andric }
33510b57cec5SDimitry Andric
33520b57cec5SDimitry Andric static inline Instruction *getTombstoneKey() {
33530b57cec5SDimitry Andric return DenseMapInfo<Instruction *>::getTombstoneKey();
33540b57cec5SDimitry Andric }
33550b57cec5SDimitry Andric
33560b57cec5SDimitry Andric static unsigned getHashValue(const Instruction *I) {
33570b57cec5SDimitry Andric assert(canHandle(I) && "Unknown instruction!");
33580b57cec5SDimitry Andric return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
33590b57cec5SDimitry Andric I->value_op_end()));
33600b57cec5SDimitry Andric }
33610b57cec5SDimitry Andric
33620b57cec5SDimitry Andric static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
33630b57cec5SDimitry Andric if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
33640b57cec5SDimitry Andric LHS == getTombstoneKey() || RHS == getTombstoneKey())
33650b57cec5SDimitry Andric return LHS == RHS;
33660b57cec5SDimitry Andric return LHS->isIdenticalTo(RHS);
33670b57cec5SDimitry Andric }
33680b57cec5SDimitry Andric };
33690b57cec5SDimitry Andric
33700b57cec5SDimitry Andric } // end anonymous namespace
33710b57cec5SDimitry Andric
33720b57cec5SDimitry Andric /// Perform CSE of induction variable instructions.
33730b57cec5SDimitry Andric static void cse(BasicBlock *BB) {
33740b57cec5SDimitry Andric // Perform simple cse.
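// Illustrative example (hypothetical values): two identical instructions in
// the vector header such as
//   %e1 = extractelement <4 x i32> %v, i32 3
//   %e2 = extractelement <4 x i32> %v, i32 3
// hash to the same key, so the second is replaced by the first and erased.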
33750b57cec5SDimitry Andric SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3376349cc55cSDimitry Andric for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3377349cc55cSDimitry Andric if (!CSEDenseMapInfo::canHandle(&In))
33780b57cec5SDimitry Andric continue;
33790b57cec5SDimitry Andric
33800b57cec5SDimitry Andric // Check if we can replace this instruction with any of the
33810b57cec5SDimitry Andric // visited instructions.
3382349cc55cSDimitry Andric if (Instruction *V = CSEMap.lookup(&In)) {
3383349cc55cSDimitry Andric In.replaceAllUsesWith(V);
3384349cc55cSDimitry Andric In.eraseFromParent();
33850b57cec5SDimitry Andric continue;
33860b57cec5SDimitry Andric }
33870b57cec5SDimitry Andric
3388349cc55cSDimitry Andric CSEMap[&In] = &In;
33890b57cec5SDimitry Andric }
33900b57cec5SDimitry Andric }
33910b57cec5SDimitry Andric
3392c9157d92SDimitry Andric InstructionCost
3393c9157d92SDimitry Andric LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3394c9157d92SDimitry Andric ElementCount VF) const {
3395c9157d92SDimitry Andric // We only need to calculate a cost if the VF is scalar; for actual vectors
3396c9157d92SDimitry Andric // we should already have a pre-calculated cost at each VF.
3397c9157d92SDimitry Andric if (!VF.isScalar())
3398c9157d92SDimitry Andric return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
33990b57cec5SDimitry Andric
3400bdd1243dSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3401c9157d92SDimitry Andric Type *RetTy = CI->getType();
3402c9157d92SDimitry Andric if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
3403c9157d92SDimitry Andric if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3404c9157d92SDimitry Andric return *RedCost;
3405c9157d92SDimitry Andric
3406c9157d92SDimitry Andric SmallVector<Type *, 4> Tys;
3407c9157d92SDimitry Andric for (auto &ArgOp : CI->args())
3408c9157d92SDimitry Andric Tys.push_back(ArgOp->getType());
3409c9157d92SDimitry Andric
3410e8d8bef9SDimitry Andric InstructionCost ScalarCallCost =
3411c9157d92SDimitry Andric TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
3412c9157d92SDimitry Andric
3413c9157d92SDimitry Andric // If this is an intrinsic we may have a lower cost for it.
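// For example (hedged): a call that the TLI recognizes as a vector
// intrinsic, such as fabs() mapping to llvm.fabs, may be cheaper as the
// intrinsic than as a widened library call, so the cheaper of the two
// costs is returned.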
3414c9157d92SDimitry Andric if (getVectorIntrinsicIDForCall(CI, TLI)) {
3415c9157d92SDimitry Andric InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3416c9157d92SDimitry Andric return std::min(ScalarCallCost, IntrinsicCost);
3417c9157d92SDimitry Andric }
34180b57cec5SDimitry Andric return ScalarCallCost;
34190b57cec5SDimitry Andric }
34200b57cec5SDimitry Andric
3421fe6060f1SDimitry Andric static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3422fe6060f1SDimitry Andric if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3423fe6060f1SDimitry Andric return Elt;
3424fe6060f1SDimitry Andric return VectorType::get(Elt, VF);
3425fe6060f1SDimitry Andric }
3426fe6060f1SDimitry Andric
3427e8d8bef9SDimitry Andric InstructionCost
3428e8d8bef9SDimitry Andric LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3429fe6060f1SDimitry Andric ElementCount VF) const {
34300b57cec5SDimitry Andric Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
34310b57cec5SDimitry Andric assert(ID && "Expected intrinsic call!");
3432fe6060f1SDimitry Andric Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3433fe6060f1SDimitry Andric FastMathFlags FMF;
3434fe6060f1SDimitry Andric if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3435fe6060f1SDimitry Andric FMF = FPMO->getFastMathFlags();
34360b57cec5SDimitry Andric
3437349cc55cSDimitry Andric SmallVector<const Value *> Arguments(CI->args());
3438fe6060f1SDimitry Andric FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3439fe6060f1SDimitry Andric SmallVector<Type *> ParamTys;
3440fe6060f1SDimitry Andric std::transform(FTy->param_begin(), FTy->param_end(),
3441fe6060f1SDimitry Andric std::back_inserter(ParamTys),
3442fe6060f1SDimitry Andric [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3443fe6060f1SDimitry Andric
3444fe6060f1SDimitry Andric IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3445fe6060f1SDimitry Andric dyn_cast<IntrinsicInst>(CI));
34465ffd83dbSDimitry Andric return TTI.getIntrinsicInstrCost(CostAttrs,
34475ffd83dbSDimitry Andric TargetTransformInfo::TCK_RecipThroughput);
34480b57cec5SDimitry Andric }
34490b57cec5SDimitry Andric
34500b57cec5SDimitry Andric static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
34515ffd83dbSDimitry Andric auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
34525ffd83dbSDimitry Andric auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
34530b57cec5SDimitry Andric return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
34540b57cec5SDimitry Andric }
34555ffd83dbSDimitry Andric
34560b57cec5SDimitry Andric static Type *largestIntegerVectorType(Type *T1, Type *T2) {
34575ffd83dbSDimitry Andric auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
34585ffd83dbSDimitry Andric auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
34590b57cec5SDimitry Andric return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
34600b57cec5SDimitry Andric }
34610b57cec5SDimitry Andric
346281ad6265SDimitry Andric void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
346381ad6265SDimitry Andric VPlan &Plan) {
34640b57cec5SDimitry Andric // Fix widened non-induction PHIs by setting up the PHI operands.
346581ad6265SDimitry Andric if (EnableVPlanNativePath)
346681ad6265SDimitry Andric fixNonInductionPHIs(Plan, State);
34670b57cec5SDimitry Andric
34680b57cec5SDimitry Andric // At this point every instruction in the original loop is widened to a
34690b57cec5SDimitry Andric // vector form. Now we need to fix the recurrences in the loop. These PHI
34700b57cec5SDimitry Andric // nodes are currently empty because we did not want to introduce cycles.
3471cdc20ff6SDimitry Andric // This is the second stage of vectorizing recurrences. Note that fixing
3472cdc20ff6SDimitry Andric // reduction phis is already modeled in VPlan.
3473cdc20ff6SDimitry Andric // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3474cdc20ff6SDimitry Andric VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3475cdc20ff6SDimitry Andric VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3476cdc20ff6SDimitry Andric for (VPRecipeBase &R : HeaderVPBB->phis()) {
3477cdc20ff6SDimitry Andric if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3478cdc20ff6SDimitry Andric fixFixedOrderRecurrence(FOR, State);
3479cdc20ff6SDimitry Andric }
34800b57cec5SDimitry Andric
3481480093f4SDimitry Andric // Forget the original basic block.
3482480093f4SDimitry Andric PSE.getSE()->forgetLoop(OrigLoop);
3483c9157d92SDimitry Andric PSE.getSE()->forgetBlockAndLoopDispositions();
34840b57cec5SDimitry Andric
3485fe013be4SDimitry Andric // After vectorization, the exit blocks of the original loop will have
3486fe013be4SDimitry Andric // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3487fe013be4SDimitry Andric // looked through single-entry phis.
3488fe013be4SDimitry Andric SmallVector<BasicBlock *> ExitBlocks;
3489fe013be4SDimitry Andric OrigLoop->getExitBlocks(ExitBlocks);
3490fe013be4SDimitry Andric for (BasicBlock *Exit : ExitBlocks)
3491fe013be4SDimitry Andric for (PHINode &PN : Exit->phis())
3492c9157d92SDimitry Andric PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3493fe013be4SDimitry Andric
3494cdc20ff6SDimitry Andric VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
349581ad6265SDimitry Andric Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3496fe013be4SDimitry Andric if (Cost->requiresScalarEpilogue(VF.isVector())) {
349781ad6265SDimitry Andric // No edge from the middle block to the unique exit block has been inserted
349881ad6265SDimitry Andric // and there is nothing to fix from the vector loop; phis should have
349981ad6265SDimitry Andric // incoming values from the scalar loop only.
350081ad6265SDimitry Andric } else {
3501fe013be4SDimitry Andric // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3502fe013be4SDimitry Andric // the cost model.
3503fe013be4SDimitry Andric
3504fe6060f1SDimitry Andric // If we inserted an edge from the middle block to the unique exit block,
3505fe6060f1SDimitry Andric // update uses outside the loop (phis) to account for the newly inserted
3506fe6060f1SDimitry Andric // edge.
350781ad6265SDimitry Andric
35080b57cec5SDimitry Andric // Fix-up external users of the induction variables.
3509bdd1243dSDimitry Andric for (const auto &Entry : Legal->getInductionVars())
35100b57cec5SDimitry Andric fixupIVUsers(Entry.first, Entry.second,
351181ad6265SDimitry Andric getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
351281ad6265SDimitry Andric IVEndValues[Entry.first], LoopMiddleBlock,
3513fe013be4SDimitry Andric VectorLoop->getHeader(), Plan, State);
3514fe6060f1SDimitry Andric }
3515fe6060f1SDimitry Andric
351681ad6265SDimitry Andric // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
351781ad6265SDimitry Andric // in the exit block, so update the builder.
3518c9157d92SDimitry Andric State.Builder.SetInsertPoint(State.CFG.ExitBB,
3519c9157d92SDimitry Andric State.CFG.ExitBB->getFirstNonPHIIt());
3520bdd1243dSDimitry Andric for (const auto &KV : Plan.getLiveOuts())
352181ad6265SDimitry Andric KV.second->fixPhi(Plan, State);
352281ad6265SDimitry Andric
35230b57cec5SDimitry Andric for (Instruction *PI : PredicatedInstructions)
35240b57cec5SDimitry Andric sinkScalarOperands(&*PI);
35250b57cec5SDimitry Andric
35260b57cec5SDimitry Andric // Remove redundant induction instructions.
352781ad6265SDimitry Andric cse(VectorLoop->getHeader());
35285ffd83dbSDimitry Andric
35295ffd83dbSDimitry Andric // Set/update profile weights for the vector and remainder loops as original
35305ffd83dbSDimitry Andric // loop iterations are now distributed among them. Note that the original
35315ffd83dbSDimitry Andric // loop, represented by LoopScalarBody, becomes the remainder loop after vectorization.
35325ffd83dbSDimitry Andric //
35335ffd83dbSDimitry Andric // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
35345ffd83dbSDimitry Andric // end up with a slightly roughened result, but that should be OK since the
35355ffd83dbSDimitry Andric // profile is not inherently precise anyway. Note also that a possible bypass of
35365ffd83dbSDimitry Andric // vector code caused by legality checks is ignored, assigning all the weight
35375ffd83dbSDimitry Andric // to the vector loop, optimistically.
3538e8d8bef9SDimitry Andric //
3539e8d8bef9SDimitry Andric // For scalable vectorization we can't know at compile time how many iterations
3540e8d8bef9SDimitry Andric // of the loop are handled in one vector iteration, so instead assume a pessimistic
3541e8d8bef9SDimitry Andric // vscale of '1'.
354281ad6265SDimitry Andric setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
354381ad6265SDimitry Andric LI->getLoopFor(LoopScalarBody),
354481ad6265SDimitry Andric VF.getKnownMinValue() * UF);
35450b57cec5SDimitry Andric }
35460b57cec5SDimitry Andric
3547bdd1243dSDimitry Andric void InnerLoopVectorizer::fixFixedOrderRecurrence(
354804eeddc0SDimitry Andric VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
35490b57cec5SDimitry Andric // This is the second phase of vectorizing first-order recurrences. An
35500b57cec5SDimitry Andric // overview of the transformation is described below. Suppose we have the
35510b57cec5SDimitry Andric // following loop.
35520b57cec5SDimitry Andric //
35530b57cec5SDimitry Andric // for (int i = 0; i < n; ++i)
35540b57cec5SDimitry Andric // b[i] = a[i] - a[i - 1];
35550b57cec5SDimitry Andric //
35560b57cec5SDimitry Andric // There is a first-order recurrence on "a". For this loop, the shorthand
35570b57cec5SDimitry Andric // scalar IR looks like:
35580b57cec5SDimitry Andric //
35590b57cec5SDimitry Andric // scalar.ph:
35600b57cec5SDimitry Andric // s_init = a[-1]
35610b57cec5SDimitry Andric // br scalar.body
35620b57cec5SDimitry Andric //
35630b57cec5SDimitry Andric // scalar.body:
35640b57cec5SDimitry Andric // i = phi [0, scalar.ph], [i+1, scalar.body]
35650b57cec5SDimitry Andric // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
35660b57cec5SDimitry Andric // s2 = a[i]
35670b57cec5SDimitry Andric // b[i] = s2 - s1
35680b57cec5SDimitry Andric // br cond, scalar.body, ...
35690b57cec5SDimitry Andric //
35700b57cec5SDimitry Andric // In this example, s1 is a recurrence because its value depends on the
35710b57cec5SDimitry Andric // previous iteration. In the first phase of vectorization, we created a
3572fe6060f1SDimitry Andric // vector phi v1 for s1. We now complete the vectorization and produce the
35730b57cec5SDimitry Andric // shorthand vector IR shown below (for VF = 4, UF = 1).
35740b57cec5SDimitry Andric //
35750b57cec5SDimitry Andric // vector.ph:
35760b57cec5SDimitry Andric // v_init = vector(..., ..., ..., a[-1])
35770b57cec5SDimitry Andric // br vector.body
35780b57cec5SDimitry Andric //
35790b57cec5SDimitry Andric // vector.body
35800b57cec5SDimitry Andric // i = phi [0, vector.ph], [i+4, vector.body]
35810b57cec5SDimitry Andric // v1 = phi [v_init, vector.ph], [v2, vector.body]
35820b57cec5SDimitry Andric // v2 = a[i, i+1, i+2, i+3];
35830b57cec5SDimitry Andric // v3 = vector(v1(3), v2(0, 1, 2))
35840b57cec5SDimitry Andric // b[i, i+1, i+2, i+3] = v2 - v3
35850b57cec5SDimitry Andric // br cond, vector.body, middle.block
35860b57cec5SDimitry Andric //
35870b57cec5SDimitry Andric // middle.block:
35880b57cec5SDimitry Andric // x = v2(3)
35890b57cec5SDimitry Andric // br scalar.ph
35900b57cec5SDimitry Andric //
35910b57cec5SDimitry Andric // scalar.ph:
35920b57cec5SDimitry Andric // s_init = phi [x, middle.block], [a[-1], otherwise]
35930b57cec5SDimitry Andric // br scalar.body
35940b57cec5SDimitry Andric //
35950b57cec5SDimitry Andric // After execution completes the vector loop, we extract the next value of
35960b57cec5SDimitry Andric // the recurrence (x) to use as the initial value in the scalar loop.
35970b57cec5SDimitry Andric
35980b57cec5SDimitry Andric // Extract the last vector element in the middle block. This will be the
35990b57cec5SDimitry Andric // initial value for the recurrence when jumping to the scalar loop.
3600349cc55cSDimitry Andric VPValue *PreviousDef = PhiR->getBackedgeValue();
3601349cc55cSDimitry Andric Value *Incoming = State.get(PreviousDef, UF - 1);
36020b57cec5SDimitry Andric auto *ExtractForScalar = Incoming;
3603349cc55cSDimitry Andric auto *IdxTy = Builder.getInt32Ty();
3604fe013be4SDimitry Andric Value *RuntimeVF = nullptr;
3605e8d8bef9SDimitry Andric if (VF.isVector()) {
3606fe6060f1SDimitry Andric auto *One = ConstantInt::get(IdxTy, 1);
36070b57cec5SDimitry Andric Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3608fe013be4SDimitry Andric RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3609fe6060f1SDimitry Andric auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3610fe013be4SDimitry Andric ExtractForScalar =
3611fe013be4SDimitry Andric Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
36120b57cec5SDimitry Andric }
3613fe013be4SDimitry Andric
3614fe013be4SDimitry Andric auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3615fe013be4SDimitry Andric assert(PhiR->getNumUsers() == 1 &&
3616fe013be4SDimitry Andric RecurSplice->getOpcode() ==
3617fe013be4SDimitry Andric VPInstruction::FirstOrderRecurrenceSplice &&
3618fe013be4SDimitry Andric "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3619fe013be4SDimitry Andric SmallVector<VPLiveOut *> LiveOuts;
3620fe013be4SDimitry Andric for (VPUser *U : RecurSplice->users())
3621fe013be4SDimitry Andric if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3622fe013be4SDimitry Andric LiveOuts.push_back(LiveOut);
3623fe013be4SDimitry Andric
3624fe013be4SDimitry Andric if (!LiveOuts.empty()) {
36250b57cec5SDimitry Andric // Extract the second last element in the middle block if the
36260b57cec5SDimitry Andric // Phi is used outside the loop. We need to extract the phi itself
36270b57cec5SDimitry Andric // and not the last element (the phi update in the current iteration). This
3628fe013be4SDimitry Andric // will be the value when jumping to the exit block from the
3629fe013be4SDimitry Andric // LoopMiddleBlock, when the scalar loop is not run at all.
36300b57cec5SDimitry Andric Value *ExtractForPhiUsedOutsideLoop = nullptr;
3631fe6060f1SDimitry Andric if (VF.isVector()) {
3632fe6060f1SDimitry Andric auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
36330b57cec5SDimitry Andric ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3634fe6060f1SDimitry Andric Incoming, Idx, "vector.recur.extract.for.phi");
3635fe013be4SDimitry Andric } else {
3636fe013be4SDimitry Andric assert(UF > 1 && "VF and UF cannot both be 1");
36370b57cec5SDimitry Andric // When the loop is unrolled without vectorizing, initialize
3638fe013be4SDimitry Andric // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3639fe013be4SDimitry Andric // `Incoming` (i.e. part UF - 2). This is analogous to the vectorized case
3640fe013be4SDimitry Andric // above: extracting the second last element when VF > 1.
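// Illustrative sketch (assuming UF == 2, VF == 1): `Incoming` above is
// part 1, the last unrolled copy, so the live-out phi is fed from part 0
// here, mirroring the "second last element" extraction in the vector case.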
3641fe6060f1SDimitry Andric ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3642fe013be4SDimitry Andric }
3643fe013be4SDimitry Andric
3644fe013be4SDimitry Andric for (VPLiveOut *LiveOut : LiveOuts) {
3645fe013be4SDimitry Andric assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3646fe013be4SDimitry Andric PHINode *LCSSAPhi = LiveOut->getPhi();
3647fe013be4SDimitry Andric LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3648fe013be4SDimitry Andric State.Plan->removeLiveOut(LCSSAPhi);
3649fe013be4SDimitry Andric }
3650fe013be4SDimitry Andric }
36510b57cec5SDimitry Andric
36520b57cec5SDimitry Andric // Fix the initial value of the original recurrence in the scalar loop.
3653c9157d92SDimitry Andric Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
3654fe6060f1SDimitry Andric PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
36550b57cec5SDimitry Andric auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3656fe6060f1SDimitry Andric auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
36570b57cec5SDimitry Andric for (auto *BB : predecessors(LoopScalarPreHeader)) {
36580b57cec5SDimitry Andric auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
36590b57cec5SDimitry Andric Start->addIncoming(Incoming, BB);
36600b57cec5SDimitry Andric }
36610b57cec5SDimitry Andric
36620b57cec5SDimitry Andric Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
36630b57cec5SDimitry Andric Phi->setName("scalar.recur");
36640b57cec5SDimitry Andric }
36650b57cec5SDimitry Andric
36660b57cec5SDimitry Andric void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
36670b57cec5SDimitry Andric // The basic block and loop containing the predicated instruction.
36680b57cec5SDimitry Andric auto *PredBB = PredInst->getParent();
36690b57cec5SDimitry Andric auto *VectorLoop = LI->getLoopFor(PredBB);
36700b57cec5SDimitry Andric
36710b57cec5SDimitry Andric // Initialize a worklist with the operands of the predicated instruction.
36720b57cec5SDimitry Andric SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
36730b57cec5SDimitry Andric
36740b57cec5SDimitry Andric // Holds instructions that we need to analyze again. An instruction may be
36750b57cec5SDimitry Andric // reanalyzed if we don't yet know if we can sink it or not.
36760b57cec5SDimitry Andric SmallVector<Instruction *, 8> InstsToReanalyze;
36770b57cec5SDimitry Andric
36780b57cec5SDimitry Andric // Returns true if a given use occurs in the predicated block. Phi nodes use
36790b57cec5SDimitry Andric // their operands in their corresponding predecessor blocks.
36800b57cec5SDimitry Andric auto isBlockOfUsePredicated = [&](Use &U) -> bool {
36810b57cec5SDimitry Andric auto *I = cast<Instruction>(U.getUser());
36820b57cec5SDimitry Andric BasicBlock *BB = I->getParent();
36830b57cec5SDimitry Andric if (auto *Phi = dyn_cast<PHINode>(I))
36840b57cec5SDimitry Andric BB = Phi->getIncomingBlock(
36850b57cec5SDimitry Andric PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
36860b57cec5SDimitry Andric return BB == PredBB;
36870b57cec5SDimitry Andric };
36880b57cec5SDimitry Andric
36890b57cec5SDimitry Andric // Iteratively sink the scalarized operands of the predicated instruction
36900b57cec5SDimitry Andric // into the block we created for it. When an instruction is sunk, its
36910b57cec5SDimitry Andric // operands are then added to the worklist. The algorithm ends when a full
36920b57cec5SDimitry Andric // pass through the worklist does not sink a single instruction.
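// Illustrative sketch (not from the source): if a scalarized predicated
//   store i32 %val, ptr %gep
// lives in PredBB while %gep = getelementptr ... is still emitted in the
// vector body, %gep (and, transitively, its scalar operands) can be moved
// into PredBB once all of their uses are inside it.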
36930b57cec5SDimitry Andric bool Changed;
36940b57cec5SDimitry Andric do {
36950b57cec5SDimitry Andric // Add the instructions that need to be reanalyzed to the worklist, and
36960b57cec5SDimitry Andric // reset the changed indicator.
36970b57cec5SDimitry Andric Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
36980b57cec5SDimitry Andric InstsToReanalyze.clear();
36990b57cec5SDimitry Andric Changed = false;
37000b57cec5SDimitry Andric
37010b57cec5SDimitry Andric while (!Worklist.empty()) {
37020b57cec5SDimitry Andric auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
37030b57cec5SDimitry Andric
3704fe6060f1SDimitry Andric // We can't sink an instruction if it is a phi node, is not in the loop,
3705fe013be4SDimitry Andric // may have side effects or may read from memory.
3706fe013be4SDimitry Andric // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3707fe6060f1SDimitry Andric if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3708fe013be4SDimitry Andric I->mayHaveSideEffects() || I->mayReadFromMemory())
37090b57cec5SDimitry Andric continue;
37100b57cec5SDimitry Andric
3711fe6060f1SDimitry Andric // If the instruction is already in PredBB, check if we can sink its
3712fe6060f1SDimitry Andric // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3713fe6060f1SDimitry Andric // sinking the scalar instruction I, hence it appears in PredBB; but it
3714fe6060f1SDimitry Andric // may have failed to sink I's operands (recursively), which we try
3715fe6060f1SDimitry Andric // (again) here.
3716fe6060f1SDimitry Andric if (I->getParent() == PredBB) {
3717fe6060f1SDimitry Andric Worklist.insert(I->op_begin(), I->op_end());
3718fe6060f1SDimitry Andric continue;
3719fe6060f1SDimitry Andric }
3720fe6060f1SDimitry Andric
37210b57cec5SDimitry Andric // It's legal to sink the instruction if all its uses occur in the
37220b57cec5SDimitry Andric // predicated block. Otherwise, there's nothing to do yet, and we may
37230b57cec5SDimitry Andric // need to reanalyze the instruction.
37240b57cec5SDimitry Andric if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
37250b57cec5SDimitry Andric InstsToReanalyze.push_back(I);
37260b57cec5SDimitry Andric continue;
37270b57cec5SDimitry Andric }
37280b57cec5SDimitry Andric
37290b57cec5SDimitry Andric // Move the instruction to the beginning of the predicated block, and add
37300b57cec5SDimitry Andric // its operands to the worklist.
37310b57cec5SDimitry Andric I->moveBefore(&*PredBB->getFirstInsertionPt());
37320b57cec5SDimitry Andric Worklist.insert(I->op_begin(), I->op_end());
37330b57cec5SDimitry Andric
37340b57cec5SDimitry Andric // The sinking may have enabled other instructions to be sunk, so we will
37350b57cec5SDimitry Andric // need to iterate.
37360b57cec5SDimitry Andric Changed = true;
37370b57cec5SDimitry Andric }
37380b57cec5SDimitry Andric } while (Changed);
37390b57cec5SDimitry Andric }
37400b57cec5SDimitry Andric
374181ad6265SDimitry Andric void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
374281ad6265SDimitry Andric VPTransformState &State) {
3743bdd1243dSDimitry Andric auto Iter = vp_depth_first_deep(Plan.getEntry());
374481ad6265SDimitry Andric for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
374581ad6265SDimitry Andric for (VPRecipeBase &P : VPBB->phis()) {
374681ad6265SDimitry Andric VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
374781ad6265SDimitry Andric if (!VPPhi)
374881ad6265SDimitry Andric continue;
3749fe6060f1SDimitry Andric PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3750fe6060f1SDimitry Andric // Make sure the builder has a valid insert point.
37510b57cec5SDimitry Andric Builder.SetInsertPoint(NewPhi);
3752fe6060f1SDimitry Andric for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3753fe6060f1SDimitry Andric VPValue *Inc = VPPhi->getIncomingValue(i);
3754fe6060f1SDimitry Andric VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3755fe6060f1SDimitry Andric NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
37560b57cec5SDimitry Andric }
37570b57cec5SDimitry Andric }
37580b57cec5SDimitry Andric }
375981ad6265SDimitry Andric }
37600b57cec5SDimitry Andric
37610eae32dcSDimitry Andric bool InnerLoopVectorizer::useOrderedReductions(
37620eae32dcSDimitry Andric const RecurrenceDescriptor &RdxDesc) {
3763fe6060f1SDimitry Andric return Cost->useOrderedReductions(RdxDesc);
3764fe6060f1SDimitry Andric }
3765fe6060f1SDimitry Andric
3766e8d8bef9SDimitry Andric void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
37670b57cec5SDimitry Andric // We should not collect Scalars more than once per VF. Right now, this
37680b57cec5SDimitry Andric // function is called from collectUniformsAndScalars(), which already does
37690b57cec5SDimitry Andric // this check. Collecting Scalars for VF=1 does not make any sense.
3770fe013be4SDimitry Andric assert(VF.isVector() && !Scalars.contains(VF) &&
37710b57cec5SDimitry Andric "This function should not be visited twice for the same VF");
37720b57cec5SDimitry Andric
377381ad6265SDimitry Andric // This avoids any chances of creating a REPLICATE recipe during planning
377481ad6265SDimitry Andric // since that would result in generation of scalarized code during execution,
377581ad6265SDimitry Andric // which is not supported for scalable vectors.
377681ad6265SDimitry Andric if (VF.isScalable()) {
377781ad6265SDimitry Andric Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
377881ad6265SDimitry Andric return;
377981ad6265SDimitry Andric }
378081ad6265SDimitry Andric
37810b57cec5SDimitry Andric SmallSetVector<Instruction *, 8> Worklist;
37820b57cec5SDimitry Andric
37830b57cec5SDimitry Andric // These sets are used to seed the analysis with pointers used by memory
37840b57cec5SDimitry Andric // accesses that will remain scalar.
37850b57cec5SDimitry Andric SmallSetVector<Instruction *, 8> ScalarPtrs;
37860b57cec5SDimitry Andric SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3787e8d8bef9SDimitry Andric auto *Latch = TheLoop->getLoopLatch();
37880b57cec5SDimitry Andric
37890b57cec5SDimitry Andric // A helper that returns true if the use of Ptr by MemAccess will be scalar.
37900b57cec5SDimitry Andric // The pointer operands of loads and stores will be scalar as long as the
37910b57cec5SDimitry Andric // memory access is not a gather or scatter operation. The value operand of a
37920b57cec5SDimitry Andric // store will remain scalar if the store is scalarized.
37930b57cec5SDimitry Andric auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
37940b57cec5SDimitry Andric InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
37950b57cec5SDimitry Andric assert(WideningDecision != CM_Unknown &&
37960b57cec5SDimitry Andric "Widening decision should be ready at this moment");
37970b57cec5SDimitry Andric if (auto *Store = dyn_cast<StoreInst>(MemAccess))
37980b57cec5SDimitry Andric if (Ptr == Store->getValueOperand())
37990b57cec5SDimitry Andric return WideningDecision == CM_Scalarize;
38000b57cec5SDimitry Andric assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
38010b57cec5SDimitry Andric "Ptr is neither a value or pointer operand");
38020b57cec5SDimitry Andric return WideningDecision != CM_GatherScatter;
38030b57cec5SDimitry Andric };
38040b57cec5SDimitry Andric
38050b57cec5SDimitry Andric // A helper that returns true if the given value is a bitcast or
38060b57cec5SDimitry Andric // getelementptr instruction contained in the loop.
38070b57cec5SDimitry Andric auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
38080b57cec5SDimitry Andric return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
38090b57cec5SDimitry Andric isa<GetElementPtrInst>(V)) &&
38100b57cec5SDimitry Andric !TheLoop->isLoopInvariant(V);
38110b57cec5SDimitry Andric };
38120b57cec5SDimitry Andric
38134824e7fdSDimitry Andric // A helper that evaluates a memory access's use of a pointer. If the use will
38144824e7fdSDimitry Andric // be a scalar use and the pointer is only used by memory accesses, we place
38154824e7fdSDimitry Andric // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
38164824e7fdSDimitry Andric // PossibleNonScalarPtrs.
38170b57cec5SDimitry Andric auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
38180b57cec5SDimitry Andric // We only care about bitcast and getelementptr instructions contained in
38190b57cec5SDimitry Andric // the loop.
38200b57cec5SDimitry Andric if (!isLoopVaryingBitCastOrGEP(Ptr))
38210b57cec5SDimitry Andric return;
38220b57cec5SDimitry Andric
38230b57cec5SDimitry Andric // If the pointer has already been identified as scalar (e.g., if it was
38240b57cec5SDimitry Andric // also identified as uniform), there's nothing to do.
38250b57cec5SDimitry Andric auto *I = cast<Instruction>(Ptr);
38260b57cec5SDimitry Andric if (Worklist.count(I))
38270b57cec5SDimitry Andric return;
38280b57cec5SDimitry Andric
3829349cc55cSDimitry Andric // If the use of the pointer will be a scalar use, and all users of the
3830349cc55cSDimitry Andric // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3831349cc55cSDimitry Andric // place the pointer in PossibleNonScalarPtrs.
3832349cc55cSDimitry Andric if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3833349cc55cSDimitry Andric return isa<LoadInst>(U) || isa<StoreInst>(U);
38340b57cec5SDimitry Andric }))
38350b57cec5SDimitry Andric ScalarPtrs.insert(I);
38360b57cec5SDimitry Andric else
38370b57cec5SDimitry Andric PossibleNonScalarPtrs.insert(I);
38380b57cec5SDimitry Andric };
38390b57cec5SDimitry Andric
38400b57cec5SDimitry Andric // We seed the scalars analysis with two classes of instructions: (1)
3841e8d8bef9SDimitry Andric // instructions marked uniform-after-vectorization and (2) bitcast,
3842e8d8bef9SDimitry Andric // getelementptr and (pointer) phi instructions used by memory accesses
3843e8d8bef9SDimitry Andric // requiring a scalar use.
38440b57cec5SDimitry Andric //
38450b57cec5SDimitry Andric // (1) Add to the worklist all instructions that have been identified as
38460b57cec5SDimitry Andric // uniform-after-vectorization.
38470b57cec5SDimitry Andric Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
38480b57cec5SDimitry Andric
38490b57cec5SDimitry Andric // (2) Add to the worklist all bitcast and getelementptr instructions used by
38500b57cec5SDimitry Andric // memory accesses requiring a scalar use. The pointer operands of loads and
38510b57cec5SDimitry Andric // stores will be scalar as long as the memory access is not a gather or
38520b57cec5SDimitry Andric // scatter operation. The value operand of a store will remain scalar if the
38530b57cec5SDimitry Andric // store is scalarized.
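// For example (illustrative): a getelementptr used only as the address of a
// consecutive widened load stays scalar, while one feeding a gather/scatter
// does not, per isScalarUse() above.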
38540b57cec5SDimitry Andric for (auto *BB : TheLoop->blocks())
38550b57cec5SDimitry Andric for (auto &I : *BB) {
38560b57cec5SDimitry Andric if (auto *Load = dyn_cast<LoadInst>(&I)) {
38570b57cec5SDimitry Andric evaluatePtrUse(Load, Load->getPointerOperand());
38580b57cec5SDimitry Andric } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
38590b57cec5SDimitry Andric evaluatePtrUse(Store, Store->getPointerOperand());
38600b57cec5SDimitry Andric evaluatePtrUse(Store, Store->getValueOperand());
38610b57cec5SDimitry Andric }
38620b57cec5SDimitry Andric }
38630b57cec5SDimitry Andric for (auto *I : ScalarPtrs)
38645ffd83dbSDimitry Andric if (!PossibleNonScalarPtrs.count(I)) {
38650b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
38660b57cec5SDimitry Andric Worklist.insert(I);
38670b57cec5SDimitry Andric }
38680b57cec5SDimitry Andric
38690b57cec5SDimitry Andric // Insert the forced scalars.
387081ad6265SDimitry Andric // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
38710b57cec5SDimitry Andric // induction variable when the PHI user is scalarized.
38720b57cec5SDimitry Andric auto ForcedScalar = ForcedScalars.find(VF);
38730b57cec5SDimitry Andric if (ForcedScalar != ForcedScalars.end())
3874bdd1243dSDimitry Andric for (auto *I : ForcedScalar->second) {
3875bdd1243dSDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
38760b57cec5SDimitry Andric Worklist.insert(I);
3877bdd1243dSDimitry Andric }
38780b57cec5SDimitry Andric
38790b57cec5SDimitry Andric // Expand the worklist by looking through any bitcasts and getelementptr
38800b57cec5SDimitry Andric // instructions we've already identified as scalar. This is similar to the
38810b57cec5SDimitry Andric // expansion step in collectLoopUniforms(); however, here we're only
38820b57cec5SDimitry Andric // expanding to include additional bitcasts and getelementptr instructions.
38830b57cec5SDimitry Andric unsigned Idx = 0;
38840b57cec5SDimitry Andric while (Idx != Worklist.size()) {
38850b57cec5SDimitry Andric Instruction *Dst = Worklist[Idx++];
38860b57cec5SDimitry Andric if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
38870b57cec5SDimitry Andric continue;
38880b57cec5SDimitry Andric auto *Src = cast<Instruction>(Dst->getOperand(0));
38890b57cec5SDimitry Andric if (llvm::all_of(Src->users(), [&](User *U) -> bool {
38900b57cec5SDimitry Andric auto *J = cast<Instruction>(U);
38910b57cec5SDimitry Andric return !TheLoop->contains(J) || Worklist.count(J) ||
38920b57cec5SDimitry Andric ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
38930b57cec5SDimitry Andric isScalarUse(J, Src));
38940b57cec5SDimitry Andric })) {
38950b57cec5SDimitry Andric Worklist.insert(Src);
38960b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
38970b57cec5SDimitry Andric }
38980b57cec5SDimitry Andric }
38990b57cec5SDimitry Andric
39000b57cec5SDimitry Andric // An induction variable will remain scalar if all users of the induction
39010b57cec5SDimitry Andric // variable and induction variable update remain scalar.
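// For example (illustrative): if an IV and its update are used only by a
// getelementptr that was already identified as scalar (say, it feeds a
// scalarized store), both remain scalar after vectorization.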
3902bdd1243dSDimitry Andric for (const auto &Induction : Legal->getInductionVars()) {
39030b57cec5SDimitry Andric auto *Ind = Induction.first;
39040b57cec5SDimitry Andric auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
39050b57cec5SDimitry Andric
39065ffd83dbSDimitry Andric // If tail-folding is applied, the primary induction variable will be used
39075ffd83dbSDimitry Andric // to feed a vector compare.
39085ffd83dbSDimitry Andric if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
39095ffd83dbSDimitry Andric continue;
39105ffd83dbSDimitry Andric
39114824e7fdSDimitry Andric // Returns true if \p Indvar is a pointer induction that is used directly by
39124824e7fdSDimitry Andric // load/store instruction \p I.
39134824e7fdSDimitry Andric auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
39144824e7fdSDimitry Andric Instruction *I) {
39154824e7fdSDimitry Andric return Induction.second.getKind() ==
39164824e7fdSDimitry Andric InductionDescriptor::IK_PtrInduction &&
39174824e7fdSDimitry Andric (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
39184824e7fdSDimitry Andric Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
39194824e7fdSDimitry Andric };
39204824e7fdSDimitry Andric
39210b57cec5SDimitry Andric // Determine if all users of the induction variable are scalar after
39220b57cec5SDimitry Andric // vectorization.
39230b57cec5SDimitry Andric auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
39240b57cec5SDimitry Andric auto *I = cast<Instruction>(U);
39254824e7fdSDimitry Andric return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
39264824e7fdSDimitry Andric IsDirectLoadStoreFromPtrIndvar(Ind, I);
39270b57cec5SDimitry Andric });
39280b57cec5SDimitry Andric if (!ScalarInd)
39290b57cec5SDimitry Andric continue;
39300b57cec5SDimitry Andric
39310b57cec5SDimitry Andric // Determine if all users of the induction variable update instruction are
39320b57cec5SDimitry Andric // scalar after vectorization.
39330b57cec5SDimitry Andric auto ScalarIndUpdate =
39340b57cec5SDimitry Andric llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
39350b57cec5SDimitry Andric auto *I = cast<Instruction>(U);
39364824e7fdSDimitry Andric return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
39374824e7fdSDimitry Andric IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
39380b57cec5SDimitry Andric });
39390b57cec5SDimitry Andric if (!ScalarIndUpdate)
39400b57cec5SDimitry Andric continue;
39410b57cec5SDimitry Andric
39420b57cec5SDimitry Andric // The induction variable and its update instruction will remain scalar.
39430b57cec5SDimitry Andric Worklist.insert(Ind);
39440b57cec5SDimitry Andric Worklist.insert(IndUpdate);
39450b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
39460b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
39470b57cec5SDimitry Andric << "\n");
39480b57cec5SDimitry Andric }
39490b57cec5SDimitry Andric
39500b57cec5SDimitry Andric Scalars[VF].insert(Worklist.begin(), Worklist.end());
39510b57cec5SDimitry Andric }
39520b57cec5SDimitry Andric
395304eeddc0SDimitry Andric bool LoopVectorizationCostModel::isScalarWithPredication(
395404eeddc0SDimitry Andric Instruction *I, ElementCount VF) const {
3955bdd1243dSDimitry Andric if (!isPredicatedInst(I))
39560b57cec5SDimitry Andric return false;
3957bdd1243dSDimitry Andric
3958bdd1243dSDimitry Andric // Do we have a non-scalar lowering for this predicated
3959bdd1243dSDimitry Andric // instruction? No - it is scalar with predication.
39600b57cec5SDimitry Andric switch(I->getOpcode()) {
39610b57cec5SDimitry Andric default:
3962bdd1243dSDimitry Andric return true;
3963fe013be4SDimitry Andric case Instruction::Call:
3964c9157d92SDimitry Andric if (VF.isScalar())
3965c9157d92SDimitry Andric return true;
3966c9157d92SDimitry Andric return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3967c9157d92SDimitry Andric .Kind == CM_Scalarize;
39680b57cec5SDimitry Andric case Instruction::Load:
39690b57cec5SDimitry Andric case Instruction::Store: {
39700b57cec5SDimitry Andric auto *Ptr = getLoadStorePointerOperand(I);
3971fe6060f1SDimitry Andric auto *Ty = getLoadStoreType(I);
397204eeddc0SDimitry Andric Type *VTy = Ty;
397304eeddc0SDimitry Andric if (VF.isVector())
397404eeddc0SDimitry Andric VTy = VectorType::get(Ty, VF);
39755ffd83dbSDimitry Andric const Align Alignment = getLoadStoreAlignment(I);
3976480093f4SDimitry Andric return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
397704eeddc0SDimitry Andric TTI.isLegalMaskedGather(VTy, Alignment))
3978480093f4SDimitry Andric : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
397904eeddc0SDimitry Andric TTI.isLegalMaskedScatter(VTy, Alignment));
39800b57cec5SDimitry Andric }
39810b57cec5SDimitry Andric case Instruction::UDiv:
39820b57cec5SDimitry Andric case Instruction::SDiv:
39830b57cec5SDimitry Andric case Instruction::SRem:
3984bdd1243dSDimitry Andric case Instruction::URem: {
3985bdd1243dSDimitry Andric // We have the option to use the safe-divisor idiom to avoid predication.
3986bdd1243dSDimitry Andric // The cost based decision here will always select safe-divisor for
3987bdd1243dSDimitry Andric // scalable vectors as scalarization isn't legal.
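// Illustrative sketch of that idiom (hedged, assuming VF = 4 and i32
// elements, not taken verbatim from the lowering): the divisor is made safe
// before the wide divide,
//   %safe.div = select <4 x i1> %mask, <4 x i32> %divisor,
//                      <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %res = udiv <4 x i32> %dividend, %safe.div
// which is why getDivRemSpeculationCost() below prices a vector select plus
// the vector arithmetic operation.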
3988bdd1243dSDimitry Andric const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3989bdd1243dSDimitry Andric return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3990bdd1243dSDimitry Andric }
3991bdd1243dSDimitry Andric }
3992bdd1243dSDimitry Andric }
3993bdd1243dSDimitry Andric
3994bdd1243dSDimitry Andric bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3995bdd1243dSDimitry Andric if (!blockNeedsPredicationForAnyReason(I->getParent()))
3996bdd1243dSDimitry Andric return false;
3997bdd1243dSDimitry Andric
3998bdd1243dSDimitry Andric // Can we prove this instruction is safe to unconditionally execute?
3999bdd1243dSDimitry Andric // If not, we must use some form of predication.
4000bdd1243dSDimitry Andric switch(I->getOpcode()) {
4001bdd1243dSDimitry Andric default:
4002bdd1243dSDimitry Andric return false;
4003bdd1243dSDimitry Andric case Instruction::Load:
4004bdd1243dSDimitry Andric case Instruction::Store: {
4005bdd1243dSDimitry Andric if (!Legal->isMaskRequired(I))
4006bdd1243dSDimitry Andric return false;
4007bdd1243dSDimitry Andric // When we know the load's address is loop invariant and the instruction
4008bdd1243dSDimitry Andric // in the original scalar loop was unconditionally executed then we
4009bdd1243dSDimitry Andric // don't need to mark it as a predicated instruction. Tail folding may
4010bdd1243dSDimitry Andric // introduce additional predication, but we're guaranteed to always have
4011bdd1243dSDimitry Andric // at least one active lane. We call Legal->blockNeedsPredication here
4012bdd1243dSDimitry Andric // because it doesn't query tail-folding. For stores, we need to prove
4013bdd1243dSDimitry Andric // both speculation safety (which follows from the same argument as loads),
4014bdd1243dSDimitry Andric // but also must prove the value being stored is correct. The easiest
4015bdd1243dSDimitry Andric // form of the latter is to require that all values stored are the same.
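// For example (illustrative): an unconditional load from a loop-invariant
// address in a tail-folded loop need not be predicated, because at least
// one lane is always active and the scalar loop executed the load on every
// iteration anyway.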
4016fe013be4SDimitry Andric if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
4017bdd1243dSDimitry Andric (isa<LoadInst>(I) ||
4018bdd1243dSDimitry Andric (isa<StoreInst>(I) &&
4019bdd1243dSDimitry Andric TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4020bdd1243dSDimitry Andric !Legal->blockNeedsPredication(I->getParent()))
4021bdd1243dSDimitry Andric return false;
4022bdd1243dSDimitry Andric return true;
4023bdd1243dSDimitry Andric }
4024bdd1243dSDimitry Andric case Instruction::UDiv:
4025bdd1243dSDimitry Andric case Instruction::SDiv:
4026bdd1243dSDimitry Andric case Instruction::SRem:
40270b57cec5SDimitry Andric case Instruction::URem:
4028fcaf7f86SDimitry Andric // TODO: We can use the loop-preheader as context point here and get
4029fcaf7f86SDimitry Andric // context sensitive reasoning
4030fcaf7f86SDimitry Andric return !isSafeToSpeculativelyExecute(I);
4031fe013be4SDimitry Andric case Instruction::Call:
4032fe013be4SDimitry Andric return Legal->isMaskRequired(I);
40330b57cec5SDimitry Andric }
4034bdd1243dSDimitry Andric }
4035bdd1243dSDimitry Andric
4036bdd1243dSDimitry Andric std::pair<InstructionCost, InstructionCost>
4037bdd1243dSDimitry Andric LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4038bdd1243dSDimitry Andric ElementCount VF) const {
4039bdd1243dSDimitry Andric assert(I->getOpcode() == Instruction::UDiv ||
4040bdd1243dSDimitry Andric I->getOpcode() == Instruction::SDiv ||
4041bdd1243dSDimitry Andric I->getOpcode() == Instruction::SRem ||
4042bdd1243dSDimitry Andric I->getOpcode() == Instruction::URem);
4043bdd1243dSDimitry Andric assert(!isSafeToSpeculativelyExecute(I));
4044bdd1243dSDimitry Andric
4045bdd1243dSDimitry Andric const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4046bdd1243dSDimitry Andric
4047bdd1243dSDimitry Andric // Scalarization isn't legal for scalable vector types
4048bdd1243dSDimitry Andric InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4049bdd1243dSDimitry Andric if (!VF.isScalable()) {
4050bdd1243dSDimitry Andric // Get the scalarization cost and scale this amount by the probability of
4051bdd1243dSDimitry Andric // executing the predicated block. If the instruction is not predicated,
4052bdd1243dSDimitry Andric // we fall through to the next case.
4053bdd1243dSDimitry Andric ScalarizationCost = 0;
4054bdd1243dSDimitry Andric
4055bdd1243dSDimitry Andric // These instructions have a non-void type, so account for the phi nodes
4056bdd1243dSDimitry Andric // that we will create. This cost is likely to be zero. The phi node
4057bdd1243dSDimitry Andric // cost, if any, should be scaled by the block probability because it
4058bdd1243dSDimitry Andric // models a copy at the end of each predicated block.
4059bdd1243dSDimitry Andric ScalarizationCost += VF.getKnownMinValue() *
4060bdd1243dSDimitry Andric TTI.getCFInstrCost(Instruction::PHI, CostKind);
4061bdd1243dSDimitry Andric
4062bdd1243dSDimitry Andric // The cost of the non-predicated instruction.
4063bdd1243dSDimitry Andric ScalarizationCost += VF.getKnownMinValue() *
4064bdd1243dSDimitry Andric TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4065bdd1243dSDimitry Andric
4066bdd1243dSDimitry Andric // The cost of insertelement and extractelement instructions needed for
4067bdd1243dSDimitry Andric // scalarization.
4068bdd1243dSDimitry Andric ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4069bdd1243dSDimitry Andric
4070bdd1243dSDimitry Andric // Scale the cost by the probability of executing the predicated blocks.
4071bdd1243dSDimitry Andric // This assumes the predicated block for each vector lane is equally
4072bdd1243dSDimitry Andric // likely.
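// Rough worked example (illustrative, assuming VF = 4 and a reciprocal
// block probability of 2): four scalar divides, four PHI copies and the
// insert/extract overhead are summed and then halved, modelling predicated
// blocks that execute roughly half the time.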
4073bdd1243dSDimitry Andric ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4074bdd1243dSDimitry Andric }
4075bdd1243dSDimitry Andric InstructionCost SafeDivisorCost = 0;
4076bdd1243dSDimitry Andric
4077bdd1243dSDimitry Andric auto *VecTy = ToVectorTy(I->getType(), VF);
4078bdd1243dSDimitry Andric
4079bdd1243dSDimitry Andric // The cost of the select guard to ensure all lanes are well defined
4080bdd1243dSDimitry Andric // after we speculate above any internal control flow.
4081bdd1243dSDimitry Andric SafeDivisorCost += TTI.getCmpSelInstrCost(
4082bdd1243dSDimitry Andric Instruction::Select, VecTy,
4083bdd1243dSDimitry Andric ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4084bdd1243dSDimitry Andric CmpInst::BAD_ICMP_PREDICATE, CostKind);
4085bdd1243dSDimitry Andric
4086bdd1243dSDimitry Andric // Certain instructions can be cheaper to vectorize if they have a constant
4087bdd1243dSDimitry Andric // second vector operand. One example of this are shifts on x86.
4088bdd1243dSDimitry Andric Value *Op2 = I->getOperand(1);
4089bdd1243dSDimitry Andric auto Op2Info = TTI.getOperandInfo(Op2);
4090fe013be4SDimitry Andric if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4091fe013be4SDimitry Andric Legal->isInvariant(Op2))
4092bdd1243dSDimitry Andric Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4093bdd1243dSDimitry Andric
4094bdd1243dSDimitry Andric SmallVector<const Value *, 4> Operands(I->operand_values());
4095bdd1243dSDimitry Andric SafeDivisorCost += TTI.getArithmeticInstrCost(
4096bdd1243dSDimitry Andric I->getOpcode(), VecTy, CostKind,
4097bdd1243dSDimitry Andric {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4098bdd1243dSDimitry Andric Op2Info, Operands, I);
4099bdd1243dSDimitry Andric return {ScalarizationCost, SafeDivisorCost};
41000b57cec5SDimitry Andric }
41010b57cec5SDimitry Andric
4102e8d8bef9SDimitry Andric bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4103e8d8bef9SDimitry Andric Instruction *I, ElementCount VF) {
41040b57cec5SDimitry Andric assert(isAccessInterleaved(I) && "Expecting interleaved access.");
41050b57cec5SDimitry Andric assert(getWideningDecision(I, VF) == CM_Unknown &&
41060b57cec5SDimitry Andric "Decision should not be set yet.");
41070b57cec5SDimitry Andric auto *Group = getInterleavedAccessGroup(I);
41080b57cec5SDimitry Andric assert(Group && "Must have a group.");
41090b57cec5SDimitry Andric
41100b57cec5SDimitry Andric // If the instruction's allocated size doesn't equal its type size, it
41110b57cec5SDimitry Andric // requires padding and will be scalarized.
41120b57cec5SDimitry Andric auto &DL = I->getModule()->getDataLayout();
4113fe6060f1SDimitry Andric auto *ScalarTy = getLoadStoreType(I);
4114d409305fSDimitry Andric if (hasIrregularType(ScalarTy, DL))
41150b57cec5SDimitry Andric return false;
41160b57cec5SDimitry Andric
411781ad6265SDimitry Andric // If the group involves a non-integral pointer, we may not be able to
411881ad6265SDimitry Andric // losslessly cast all values to a common type.
411981ad6265SDimitry Andric unsigned InterleaveFactor = Group->getFactor();
412081ad6265SDimitry Andric bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
412181ad6265SDimitry Andric for (unsigned i = 0; i < InterleaveFactor; i++) {
412281ad6265SDimitry Andric Instruction *Member = Group->getMember(i);
412381ad6265SDimitry Andric if (!Member)
412481ad6265SDimitry Andric continue;
412581ad6265SDimitry Andric auto *MemberTy = getLoadStoreType(Member);
412681ad6265SDimitry Andric bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
412781ad6265SDimitry Andric // Don't coerce non-integral pointers to integers or vice versa.
412881ad6265SDimitry Andric if (MemberNI != ScalarNI) {
412981ad6265SDimitry Andric // TODO: Consider adding special nullptr value case here
413081ad6265SDimitry Andric return false;
413181ad6265SDimitry Andric } else if (MemberNI && ScalarNI &&
413281ad6265SDimitry Andric ScalarTy->getPointerAddressSpace() !=
413381ad6265SDimitry Andric MemberTy->getPointerAddressSpace()) {
413481ad6265SDimitry Andric return false;
413581ad6265SDimitry Andric }
413681ad6265SDimitry Andric }
413781ad6265SDimitry Andric
41380b57cec5SDimitry Andric // Check if masking is required.
41390b57cec5SDimitry Andric // A Group may need masking for one of two reasons: it resides in a block that
4140349cc55cSDimitry Andric // needs predication, or it was decided to use masking to deal with gaps
4141349cc55cSDimitry Andric // (either a gap at the end of a load-access that may result in a speculative
4142349cc55cSDimitry Andric // load, or any gaps in a store-access).
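// Illustrative sketch (not from the source): a factor-2 load group that
// reads only A[2*i] has a gap at the end, so its last wide load may touch
// memory past the final accessed element; without a scalar epilogue that
// tail access must be masked.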
41430b57cec5SDimitry Andric bool PredicatedAccessRequiresMasking =
4144349cc55cSDimitry Andric blockNeedsPredicationForAnyReason(I->getParent()) &&
4145349cc55cSDimitry Andric Legal->isMaskRequired(I);
4146349cc55cSDimitry Andric bool LoadAccessWithGapsRequiresEpilogMasking =
4147349cc55cSDimitry Andric isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4148349cc55cSDimitry Andric !isScalarEpilogueAllowed();
4149349cc55cSDimitry Andric bool StoreAccessWithGapsRequiresMasking =
4150349cc55cSDimitry Andric isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4151349cc55cSDimitry Andric if (!PredicatedAccessRequiresMasking &&
4152349cc55cSDimitry Andric !LoadAccessWithGapsRequiresEpilogMasking &&
4153349cc55cSDimitry Andric !StoreAccessWithGapsRequiresMasking)
41540b57cec5SDimitry Andric return true;
41550b57cec5SDimitry Andric
41560b57cec5SDimitry Andric // If masked interleaving is required, we expect that the user/target had
41570b57cec5SDimitry Andric // enabled it, because otherwise it either wouldn't have been created or
41580b57cec5SDimitry Andric // it should have been invalidated by the CostModel.
41590b57cec5SDimitry Andric assert(useMaskedInterleavedAccesses(TTI) &&
41600b57cec5SDimitry Andric "Masked interleave-groups for predicated accesses are not enabled.");
41610b57cec5SDimitry Andric
4162349cc55cSDimitry Andric if (Group->isReverse())
4163349cc55cSDimitry Andric return false;
4164349cc55cSDimitry Andric
4165fe6060f1SDimitry Andric auto *Ty = getLoadStoreType(I);
41665ffd83dbSDimitry Andric const Align Alignment = getLoadStoreAlignment(I);
41678bcb0991SDimitry Andric return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
41688bcb0991SDimitry Andric : TTI.isLegalMaskedStore(Ty, Alignment);
41690b57cec5SDimitry Andric }
41700b57cec5SDimitry Andric
4171e8d8bef9SDimitry Andric bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4172e8d8bef9SDimitry Andric Instruction *I, ElementCount VF) {
41730b57cec5SDimitry Andric // Get and ensure we have a valid memory instruction.
4174349cc55cSDimitry Andric assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
41750b57cec5SDimitry Andric
41760b57cec5SDimitry Andric auto *Ptr = getLoadStorePointerOperand(I);
4177349cc55cSDimitry Andric auto *ScalarTy = getLoadStoreType(I);
41780b57cec5SDimitry Andric
41790b57cec5SDimitry Andric // In order to be widened, the pointer should be consecutive, first of all.
4180349cc55cSDimitry Andric if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
41810b57cec5SDimitry Andric return false;
41820b57cec5SDimitry Andric
41830b57cec5SDimitry Andric // If the instruction is a store located in a predicated block, it will be
41840b57cec5SDimitry Andric // scalarized.
418504eeddc0SDimitry Andric if (isScalarWithPredication(I, VF))
41860b57cec5SDimitry Andric return false;
41870b57cec5SDimitry Andric
41880b57cec5SDimitry Andric // If the instruction's allocated size doesn't equal its type size, it
41890b57cec5SDimitry Andric // requires padding and will be scalarized.
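  // E.g. on a typical x86-64 data layout, i1 has a type size of 1 bit but an
  // alloc size of 8 bits, and x86_fp80 is 80 bits wide but allocated as 128
  // bits, so vectors of such types cannot be formed by simply tightly packing
  // the scalar elements.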
41900b57cec5SDimitry Andric auto &DL = I->getModule()->getDataLayout();
4191d409305fSDimitry Andric if (hasIrregularType(ScalarTy, DL))
41920b57cec5SDimitry Andric return false;
41930b57cec5SDimitry Andric
41940b57cec5SDimitry Andric return true;
41950b57cec5SDimitry Andric }
41960b57cec5SDimitry Andric
4197e8d8bef9SDimitry Andric void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
41980b57cec5SDimitry Andric // We should not collect Uniforms more than once per VF. Right now,
41990b57cec5SDimitry Andric // this function is called from collectUniformsAndScalars(), which
42000b57cec5SDimitry Andric // already does this check. Collecting Uniforms for VF=1 does not make any
42010b57cec5SDimitry Andric // sense.
42020b57cec5SDimitry Andric
4203fe013be4SDimitry Andric assert(VF.isVector() && !Uniforms.contains(VF) &&
42040b57cec5SDimitry Andric "This function should not be visited twice for the same VF");
42050b57cec5SDimitry Andric
42060b57cec5SDimitry Andric // Visit the list of Uniforms. Even if we find no uniform values, we won't
42070b57cec5SDimitry Andric // analyze this VF again: Uniforms.count(VF) will still return 1 afterwards.
42080b57cec5SDimitry Andric Uniforms[VF].clear();
42090b57cec5SDimitry Andric
42100b57cec5SDimitry Andric // We now know that the loop is vectorizable!
42110b57cec5SDimitry Andric // Collect instructions inside the loop that will remain uniform after
42120b57cec5SDimitry Andric // vectorization.
42130b57cec5SDimitry Andric
42140b57cec5SDimitry Andric // Global values, params and instructions outside of the current loop are
42150b57cec5SDimitry Andric // out of scope.
42160b57cec5SDimitry Andric auto isOutOfScope = [&](Value *V) -> bool {
42170b57cec5SDimitry Andric Instruction *I = dyn_cast<Instruction>(V);
42180b57cec5SDimitry Andric return (!I || !TheLoop->contains(I));
42190b57cec5SDimitry Andric };
42200b57cec5SDimitry Andric
4221349cc55cSDimitry Andric // Worklist containing uniform instructions demanding lane 0.
42220b57cec5SDimitry Andric SetVector<Instruction *> Worklist;
42230b57cec5SDimitry Andric BasicBlock *Latch = TheLoop->getLoopLatch();
42240b57cec5SDimitry Andric
4225349cc55cSDimitry Andric // Add uniform instructions demanding lane 0 to the worklist. Instructions
4226349cc55cSDimitry Andric // that are scalar with predication must not be considered uniform after
4227349cc55cSDimitry Andric // vectorization, because that would create an erroneous replicating region
4228349cc55cSDimitry Andric // where only a single instance out of VF should be formed.
4229480093f4SDimitry Andric // TODO: optimize such seldom cases if found important, see PR40816.
4230480093f4SDimitry Andric auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4231e8d8bef9SDimitry Andric if (isOutOfScope(I)) {
4232e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4233e8d8bef9SDimitry Andric << *I << "\n");
4234e8d8bef9SDimitry Andric return;
4235e8d8bef9SDimitry Andric }
423604eeddc0SDimitry Andric if (isScalarWithPredication(I, VF)) {
4237480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4238480093f4SDimitry Andric << *I << "\n");
4239480093f4SDimitry Andric return;
4240480093f4SDimitry Andric }
4241480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4242480093f4SDimitry Andric Worklist.insert(I);
4243480093f4SDimitry Andric };
4244480093f4SDimitry Andric
42450b57cec5SDimitry Andric // Start with the conditional branch. If the branch condition is an
42460b57cec5SDimitry Andric // instruction contained in the loop that is only used by the branch, it is
42470b57cec5SDimitry Andric // uniform.
42480b57cec5SDimitry Andric auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4249480093f4SDimitry Andric if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4250480093f4SDimitry Andric addToWorklistIfAllowed(Cmp);
42510b57cec5SDimitry Andric
4252fe013be4SDimitry Andric auto PrevVF = VF.divideCoefficientBy(2);
4253bdd1243dSDimitry Andric // Return true if all lanes perform the same memory operation, and we can
4254bdd1243dSDimitry Andric // thus choose to execute only one.
4255bdd1243dSDimitry Andric auto isUniformMemOpUse = [&](Instruction *I) {
4256fe013be4SDimitry Andric // If the value was already known to not be uniform for the previous
4257fe013be4SDimitry Andric // (smaller VF), it cannot be uniform for the larger VF.
4258fe013be4SDimitry Andric if (PrevVF.isVector()) {
4259fe013be4SDimitry Andric auto Iter = Uniforms.find(PrevVF);
4260fe013be4SDimitry Andric if (Iter != Uniforms.end() && !Iter->second.contains(I))
4261fe013be4SDimitry Andric return false;
4262fe013be4SDimitry Andric }
4263fe013be4SDimitry Andric if (!Legal->isUniformMemOp(*I, VF))
4264bdd1243dSDimitry Andric return false;
4265bdd1243dSDimitry Andric if (isa<LoadInst>(I))
4266bdd1243dSDimitry Andric // Loading the same address always produces the same result - at least
4267bdd1243dSDimitry Andric // assuming aliasing and ordering which have already been checked.
4268bdd1243dSDimitry Andric return true;
4269bdd1243dSDimitry Andric // Storing the same value on every iteration.
4270bdd1243dSDimitry Andric return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4271bdd1243dSDimitry Andric };
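  // E.g. a load from a loop-invariant address (`x = *p` inside the loop)
  // produces the same value in every lane, and a store of a loop-invariant
  // value to such an address writes the same value on every iteration, so a
  // single scalar access per vector iteration is enough.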
4272bdd1243dSDimitry Andric
4273e8d8bef9SDimitry Andric auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
42740b57cec5SDimitry Andric InstWidening WideningDecision = getWideningDecision(I, VF);
42750b57cec5SDimitry Andric assert(WideningDecision != CM_Unknown &&
42760b57cec5SDimitry Andric "Widening decision should be ready at this moment");
42770b57cec5SDimitry Andric
4278bdd1243dSDimitry Andric if (isUniformMemOpUse(I))
4279e8d8bef9SDimitry Andric return true;
4280e8d8bef9SDimitry Andric
42810b57cec5SDimitry Andric return (WideningDecision == CM_Widen ||
42820b57cec5SDimitry Andric WideningDecision == CM_Widen_Reverse ||
42830b57cec5SDimitry Andric WideningDecision == CM_Interleave);
42840b57cec5SDimitry Andric };
4285e8d8bef9SDimitry Andric
4286e8d8bef9SDimitry Andric // Returns true if Ptr is the pointer operand of a memory access instruction
4287dbbaf778SDimitry Andric // I, I is known to not require scalarization, and the pointer is not also
4288dbbaf778SDimitry Andric // stored.
4289e8d8bef9SDimitry Andric auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4290fe013be4SDimitry Andric if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4291fe013be4SDimitry Andric return false;
4292fe013be4SDimitry Andric return getLoadStorePointerOperand(I) == Ptr &&
4293fe013be4SDimitry Andric (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4294e8d8bef9SDimitry Andric };
4295e8d8bef9SDimitry Andric
4296e8d8bef9SDimitry Andric // Holds a list of values which are known to have at least one uniform use.
4297e8d8bef9SDimitry Andric // Note that there may be other uses which aren't uniform. A "uniform use"
4298e8d8bef9SDimitry Andric // here is something which only demands lane 0 of the unrolled iterations;
4299e8d8bef9SDimitry Andric // it does not imply that all lanes produce the same value (e.g. this is not
4300e8d8bef9SDimitry Andric // the usual meaning of uniform)
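  // E.g. the pointer operand of a consecutive load that will be widened only
  // needs its lane-0 value (the wide load is built from the first lane's
  // address), even though the loaded values differ from lane to lane.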
4301fe6060f1SDimitry Andric SetVector<Value *> HasUniformUse;
4302e8d8bef9SDimitry Andric
4303e8d8bef9SDimitry Andric // Scan the loop for instructions which are either a) known to have only
4304e8d8bef9SDimitry Andric // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
43050b57cec5SDimitry Andric for (auto *BB : TheLoop->blocks())
43060b57cec5SDimitry Andric for (auto &I : *BB) {
43076e75b2fbSDimitry Andric if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
43086e75b2fbSDimitry Andric switch (II->getIntrinsicID()) {
43096e75b2fbSDimitry Andric case Intrinsic::sideeffect:
43106e75b2fbSDimitry Andric case Intrinsic::experimental_noalias_scope_decl:
43116e75b2fbSDimitry Andric case Intrinsic::assume:
43126e75b2fbSDimitry Andric case Intrinsic::lifetime_start:
43136e75b2fbSDimitry Andric case Intrinsic::lifetime_end:
43146e75b2fbSDimitry Andric if (TheLoop->hasLoopInvariantOperands(&I))
43156e75b2fbSDimitry Andric addToWorklistIfAllowed(&I);
43166e75b2fbSDimitry Andric break;
43176e75b2fbSDimitry Andric default:
43186e75b2fbSDimitry Andric break;
43196e75b2fbSDimitry Andric }
43206e75b2fbSDimitry Andric }
43216e75b2fbSDimitry Andric
4322349cc55cSDimitry Andric // ExtractValue instructions must be uniform, because the operands are
4323349cc55cSDimitry Andric // known to be loop-invariant.
4324349cc55cSDimitry Andric if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4325349cc55cSDimitry Andric assert(isOutOfScope(EVI->getAggregateOperand()) &&
4326349cc55cSDimitry Andric "Expected aggregate value to be loop invariant");
4327349cc55cSDimitry Andric addToWorklistIfAllowed(EVI);
4328349cc55cSDimitry Andric continue;
4329349cc55cSDimitry Andric }
4330349cc55cSDimitry Andric
43310b57cec5SDimitry Andric // If there's no pointer operand, there's nothing to do.
4332e8d8bef9SDimitry Andric auto *Ptr = getLoadStorePointerOperand(&I);
43330b57cec5SDimitry Andric if (!Ptr)
43340b57cec5SDimitry Andric continue;
43350b57cec5SDimitry Andric
4336bdd1243dSDimitry Andric if (isUniformMemOpUse(&I))
4337e8d8bef9SDimitry Andric addToWorklistIfAllowed(&I);
43380b57cec5SDimitry Andric
4339fe013be4SDimitry Andric if (isVectorizedMemAccessUse(&I, Ptr))
4340e8d8bef9SDimitry Andric HasUniformUse.insert(Ptr);
4341e8d8bef9SDimitry Andric }
43420b57cec5SDimitry Andric
4343e8d8bef9SDimitry Andric // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4344e8d8bef9SDimitry Andric // demanding) users. Since loops are assumed to be in LCSSA form, this
4345e8d8bef9SDimitry Andric // disallows uses outside the loop as well.
4346e8d8bef9SDimitry Andric for (auto *V : HasUniformUse) {
4347e8d8bef9SDimitry Andric if (isOutOfScope(V))
4348e8d8bef9SDimitry Andric continue;
4349e8d8bef9SDimitry Andric auto *I = cast<Instruction>(V);
4350e8d8bef9SDimitry Andric auto UsersAreMemAccesses =
4351e8d8bef9SDimitry Andric llvm::all_of(I->users(), [&](User *U) -> bool {
4352e8d8bef9SDimitry Andric return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4353e8d8bef9SDimitry Andric });
4354e8d8bef9SDimitry Andric if (UsersAreMemAccesses)
4355e8d8bef9SDimitry Andric addToWorklistIfAllowed(I);
4356e8d8bef9SDimitry Andric }
43570b57cec5SDimitry Andric
43580b57cec5SDimitry Andric // Expand Worklist in topological order: whenever a new instruction
43590b57cec5SDimitry Andric // is added, its users should already be inside the Worklist. This ensures
43600b57cec5SDimitry Andric // a uniform instruction will only be used by uniform instructions.
43610b57cec5SDimitry Andric unsigned idx = 0;
43620b57cec5SDimitry Andric while (idx != Worklist.size()) {
43630b57cec5SDimitry Andric Instruction *I = Worklist[idx++];
43640b57cec5SDimitry Andric
4365bdd1243dSDimitry Andric for (auto *OV : I->operand_values()) {
43660b57cec5SDimitry Andric // isOutOfScope operands cannot be uniform instructions.
43670b57cec5SDimitry Andric if (isOutOfScope(OV))
43680b57cec5SDimitry Andric continue;
43690b57cec5SDimitry Andric // First order recurrence Phi's should typically be considered
43700b57cec5SDimitry Andric // non-uniform.
43710b57cec5SDimitry Andric auto *OP = dyn_cast<PHINode>(OV);
4372bdd1243dSDimitry Andric if (OP && Legal->isFixedOrderRecurrence(OP))
43730b57cec5SDimitry Andric continue;
43740b57cec5SDimitry Andric // If all the users of the operand are uniform, then add the
43750b57cec5SDimitry Andric // operand into the uniform worklist.
43760b57cec5SDimitry Andric auto *OI = cast<Instruction>(OV);
43770b57cec5SDimitry Andric if (llvm::all_of(OI->users(), [&](User *U) -> bool {
43780b57cec5SDimitry Andric auto *J = cast<Instruction>(U);
4379e8d8bef9SDimitry Andric return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4380480093f4SDimitry Andric }))
4381480093f4SDimitry Andric addToWorklistIfAllowed(OI);
43820b57cec5SDimitry Andric }
43830b57cec5SDimitry Andric }
43840b57cec5SDimitry Andric
43850b57cec5SDimitry Andric // For an instruction to be added into Worklist above, all its users inside
43860b57cec5SDimitry Andric // the loop should also be in Worklist. However, this condition cannot be
43870b57cec5SDimitry Andric // true for phi nodes that form a cyclic dependence. We must process phi
43880b57cec5SDimitry Andric // nodes separately. An induction variable will remain uniform if all users
43890b57cec5SDimitry Andric // of the induction variable and induction variable update remain uniform.
43900b57cec5SDimitry Andric // The code below handles both pointer and non-pointer induction variables.
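  // For example, given `i` and `i.next = i + 1`, if `i` is only used by the
  // latch compare and by address computations that are themselves uniform,
  // both the phi and its update remain uniform; using `i` as data (say,
  // storing it to consecutive memory each iteration) keeps them non-uniform.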
4391bdd1243dSDimitry Andric for (const auto &Induction : Legal->getInductionVars()) {
43920b57cec5SDimitry Andric auto *Ind = Induction.first;
43930b57cec5SDimitry Andric auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
43940b57cec5SDimitry Andric
43950b57cec5SDimitry Andric // Determine if all users of the induction variable are uniform after
43960b57cec5SDimitry Andric // vectorization.
43970b57cec5SDimitry Andric auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
43980b57cec5SDimitry Andric auto *I = cast<Instruction>(U);
43990b57cec5SDimitry Andric return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
44000b57cec5SDimitry Andric isVectorizedMemAccessUse(I, Ind);
44010b57cec5SDimitry Andric });
44020b57cec5SDimitry Andric if (!UniformInd)
44030b57cec5SDimitry Andric continue;
44040b57cec5SDimitry Andric
44050b57cec5SDimitry Andric // Determine if all users of the induction variable update instruction are
44060b57cec5SDimitry Andric // uniform after vectorization.
44070b57cec5SDimitry Andric auto UniformIndUpdate =
44080b57cec5SDimitry Andric llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
44090b57cec5SDimitry Andric auto *I = cast<Instruction>(U);
44100b57cec5SDimitry Andric return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
44110b57cec5SDimitry Andric isVectorizedMemAccessUse(I, IndUpdate);
44120b57cec5SDimitry Andric });
44130b57cec5SDimitry Andric if (!UniformIndUpdate)
44140b57cec5SDimitry Andric continue;
44150b57cec5SDimitry Andric
44160b57cec5SDimitry Andric // The induction variable and its update instruction will remain uniform.
4417480093f4SDimitry Andric addToWorklistIfAllowed(Ind);
4418480093f4SDimitry Andric addToWorklistIfAllowed(IndUpdate);
44190b57cec5SDimitry Andric }
44200b57cec5SDimitry Andric
44210b57cec5SDimitry Andric Uniforms[VF].insert(Worklist.begin(), Worklist.end());
44220b57cec5SDimitry Andric }
44230b57cec5SDimitry Andric
44248bcb0991SDimitry Andric bool LoopVectorizationCostModel::runtimeChecksRequired() {
44258bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
44260b57cec5SDimitry Andric
44270b57cec5SDimitry Andric if (Legal->getRuntimePointerChecking()->Need) {
44288bcb0991SDimitry Andric reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
44298bcb0991SDimitry Andric "runtime pointer checks needed. Enable vectorization of this "
44300b57cec5SDimitry Andric "loop with '#pragma clang loop vectorize(enable)' when "
44318bcb0991SDimitry Andric "compiling with -Os/-Oz",
44328bcb0991SDimitry Andric "CantVersionLoopWithOptForSize", ORE, TheLoop);
44338bcb0991SDimitry Andric return true;
44340b57cec5SDimitry Andric }
44350b57cec5SDimitry Andric
443681ad6265SDimitry Andric if (!PSE.getPredicate().isAlwaysTrue()) {
44378bcb0991SDimitry Andric reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
44388bcb0991SDimitry Andric "runtime SCEV checks needed. Enable vectorization of this "
44390b57cec5SDimitry Andric "loop with '#pragma clang loop vectorize(enable)' when "
44408bcb0991SDimitry Andric "compiling with -Os/-Oz",
44418bcb0991SDimitry Andric "CantVersionLoopWithOptForSize", ORE, TheLoop);
44428bcb0991SDimitry Andric return true;
44430b57cec5SDimitry Andric }
44440b57cec5SDimitry Andric
44450b57cec5SDimitry Andric // FIXME: Avoid specializing for stride==1 instead of bailing out.
44460b57cec5SDimitry Andric if (!Legal->getLAI()->getSymbolicStrides().empty()) {
44475ffd83dbSDimitry Andric reportVectorizationFailure("Runtime stride check for small trip count",
44488bcb0991SDimitry Andric "runtime stride == 1 checks needed. Enable vectorization of "
44495ffd83dbSDimitry Andric "this loop without such check by compiling with -Os/-Oz",
44508bcb0991SDimitry Andric "CantVersionLoopWithOptForSize", ORE, TheLoop);
44518bcb0991SDimitry Andric return true;
44528bcb0991SDimitry Andric }
44538bcb0991SDimitry Andric
44548bcb0991SDimitry Andric return false;
44558bcb0991SDimitry Andric }
44568bcb0991SDimitry Andric
4457fe6060f1SDimitry Andric ElementCount
4458fe6060f1SDimitry Andric LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4459349cc55cSDimitry Andric if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4460fe6060f1SDimitry Andric return ElementCount::getScalable(0);
4461fe6060f1SDimitry Andric
4462fe6060f1SDimitry Andric if (Hints->isScalableVectorizationDisabled()) {
4463fe6060f1SDimitry Andric reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4464fe6060f1SDimitry Andric "ScalableVectorizationDisabled", ORE, TheLoop);
4465fe6060f1SDimitry Andric return ElementCount::getScalable(0);
4466fe6060f1SDimitry Andric }
4467fe6060f1SDimitry Andric
4468349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4469349cc55cSDimitry Andric
4470fe6060f1SDimitry Andric auto MaxScalableVF = ElementCount::getScalable(
4471fe6060f1SDimitry Andric std::numeric_limits<ElementCount::ScalarTy>::max());
4472fe6060f1SDimitry Andric
4473fe6060f1SDimitry Andric // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4474fe6060f1SDimitry Andric // FIXME: While for scalable vectors this is currently sufficient, this should
4475fe6060f1SDimitry Andric // be replaced by a more detailed mechanism that filters out specific VFs,
4476fe6060f1SDimitry Andric // instead of invalidating vectorization for a whole set of VFs based on the
4477fe6060f1SDimitry Andric // MaxVF.
4478fe6060f1SDimitry Andric
4479fe6060f1SDimitry Andric // Disable scalable vectorization if the loop contains unsupported reductions.
4480fe6060f1SDimitry Andric if (!canVectorizeReductions(MaxScalableVF)) {
4481fe6060f1SDimitry Andric reportVectorizationInfo(
4482fe6060f1SDimitry Andric "Scalable vectorization not supported for the reduction "
4483fe6060f1SDimitry Andric "operations found in this loop.",
4484fe6060f1SDimitry Andric "ScalableVFUnfeasible", ORE, TheLoop);
4485fe6060f1SDimitry Andric return ElementCount::getScalable(0);
4486fe6060f1SDimitry Andric }
4487fe6060f1SDimitry Andric
4488fe6060f1SDimitry Andric // Disable scalable vectorization if the loop contains any instructions
4489fe6060f1SDimitry Andric // with element types not supported for scalable vectors.
4490fe6060f1SDimitry Andric if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4491fe6060f1SDimitry Andric return !Ty->isVoidTy() &&
4492fe6060f1SDimitry Andric !this->TTI.isElementTypeLegalForScalableVector(Ty);
4493fe6060f1SDimitry Andric })) {
4494fe6060f1SDimitry Andric reportVectorizationInfo("Scalable vectorization is not supported "
4495fe6060f1SDimitry Andric "for all element types found in this loop.",
4496fe6060f1SDimitry Andric "ScalableVFUnfeasible", ORE, TheLoop);
4497fe6060f1SDimitry Andric return ElementCount::getScalable(0);
4498fe6060f1SDimitry Andric }
4499fe6060f1SDimitry Andric
4500fe6060f1SDimitry Andric if (Legal->isSafeForAnyVectorWidth())
4501fe6060f1SDimitry Andric return MaxScalableVF;
4502fe6060f1SDimitry Andric
4503fe6060f1SDimitry Andric // Limit MaxScalableVF by the maximum safe dependence distance.
4504fe013be4SDimitry Andric if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4505fe013be4SDimitry Andric MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4506fe013be4SDimitry Andric else
4507fe013be4SDimitry Andric MaxScalableVF = ElementCount::getScalable(0);
4508fe013be4SDimitry Andric
4509fe6060f1SDimitry Andric if (!MaxScalableVF)
4510fe6060f1SDimitry Andric reportVectorizationInfo(
4511fe6060f1SDimitry Andric "Max legal vector width too small, scalable vectorization "
4512fe6060f1SDimitry Andric "unfeasible.",
4513fe6060f1SDimitry Andric "ScalableVFUnfeasible", ORE, TheLoop);
4514fe6060f1SDimitry Andric
4515fe6060f1SDimitry Andric return MaxScalableVF;
4516fe6060f1SDimitry Andric }
4517fe6060f1SDimitry Andric
45180eae32dcSDimitry Andric FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4519c9157d92SDimitry Andric unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4520fe6060f1SDimitry Andric MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4521fe6060f1SDimitry Andric unsigned SmallestType, WidestType;
4522fe6060f1SDimitry Andric std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4523fe6060f1SDimitry Andric
4524fe6060f1SDimitry Andric // Get the maximum safe dependence distance in bits computed by LAA.
4525fe6060f1SDimitry Andric // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4526fe6060f1SDimitry Andric // the memory access that is most restrictive (involved in the smallest
4527fe6060f1SDimitry Andric // dependence distance).
4528fe6060f1SDimitry Andric unsigned MaxSafeElements =
4529fe013be4SDimitry Andric llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
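  // For instance, a maximum safe vector width of 256 bits with a widest type
  // of 32 bits gives MaxSafeElements = bit_floor(256 / 32) = 8.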
4530fe6060f1SDimitry Andric
4531fe6060f1SDimitry Andric auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4532fe6060f1SDimitry Andric auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4533fe6060f1SDimitry Andric
4534fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4535fe6060f1SDimitry Andric << ".\n");
4536fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4537fe6060f1SDimitry Andric << ".\n");
4538fe6060f1SDimitry Andric
4539fe6060f1SDimitry Andric // First analyze the UserVF, fall back if the UserVF should be ignored.
4540fe6060f1SDimitry Andric if (UserVF) {
4541fe6060f1SDimitry Andric auto MaxSafeUserVF =
4542fe6060f1SDimitry Andric UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4543fe6060f1SDimitry Andric
4544fe6060f1SDimitry Andric if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4545fe6060f1SDimitry Andric // If `VF=vscale x N` is safe, then so is `VF=N`
4546fe6060f1SDimitry Andric if (UserVF.isScalable())
4547fe6060f1SDimitry Andric return FixedScalableVFPair(
4548fe6060f1SDimitry Andric ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4549fe6060f1SDimitry Andric else
4550fe6060f1SDimitry Andric return UserVF;
4551fe6060f1SDimitry Andric }
4552fe6060f1SDimitry Andric
4553fe6060f1SDimitry Andric assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4554fe6060f1SDimitry Andric
4555fe6060f1SDimitry Andric // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4556fe6060f1SDimitry Andric // is better to ignore the hint and let the compiler choose a suitable VF.
4557fe6060f1SDimitry Andric if (!UserVF.isScalable()) {
4558fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4559fe6060f1SDimitry Andric << " is unsafe, clamping to max safe VF="
4560fe6060f1SDimitry Andric << MaxSafeFixedVF << ".\n");
4561fe6060f1SDimitry Andric ORE->emit([&]() {
4562fe6060f1SDimitry Andric return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4563fe6060f1SDimitry Andric TheLoop->getStartLoc(),
4564fe6060f1SDimitry Andric TheLoop->getHeader())
4565fe6060f1SDimitry Andric << "User-specified vectorization factor "
4566fe6060f1SDimitry Andric << ore::NV("UserVectorizationFactor", UserVF)
4567fe6060f1SDimitry Andric << " is unsafe, clamping to maximum safe vectorization factor "
4568fe6060f1SDimitry Andric << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4569fe6060f1SDimitry Andric });
4570fe6060f1SDimitry Andric return MaxSafeFixedVF;
4571fe6060f1SDimitry Andric }
4572fe6060f1SDimitry Andric
4573349cc55cSDimitry Andric if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4574349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4575349cc55cSDimitry Andric << " is ignored because scalable vectors are not "
4576349cc55cSDimitry Andric "available.\n");
4577349cc55cSDimitry Andric ORE->emit([&]() {
4578349cc55cSDimitry Andric return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4579349cc55cSDimitry Andric TheLoop->getStartLoc(),
4580349cc55cSDimitry Andric TheLoop->getHeader())
4581349cc55cSDimitry Andric << "User-specified vectorization factor "
4582349cc55cSDimitry Andric << ore::NV("UserVectorizationFactor", UserVF)
4583349cc55cSDimitry Andric << " is ignored because the target does not support scalable "
4584349cc55cSDimitry Andric "vectors. The compiler will pick a more suitable value.";
4585349cc55cSDimitry Andric });
4586349cc55cSDimitry Andric } else {
4587fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4588fe6060f1SDimitry Andric << " is unsafe. Ignoring scalable UserVF.\n");
4589fe6060f1SDimitry Andric ORE->emit([&]() {
4590fe6060f1SDimitry Andric return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4591fe6060f1SDimitry Andric TheLoop->getStartLoc(),
4592fe6060f1SDimitry Andric TheLoop->getHeader())
4593fe6060f1SDimitry Andric << "User-specified vectorization factor "
4594fe6060f1SDimitry Andric << ore::NV("UserVectorizationFactor", UserVF)
4595fe6060f1SDimitry Andric << " is unsafe. Ignoring the hint to let the compiler pick a "
4596349cc55cSDimitry Andric "more suitable value.";
4597fe6060f1SDimitry Andric });
4598fe6060f1SDimitry Andric }
4599349cc55cSDimitry Andric }
4600fe6060f1SDimitry Andric
4601fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4602fe6060f1SDimitry Andric << " / " << WidestType << " bits.\n");
4603fe6060f1SDimitry Andric
4604fe6060f1SDimitry Andric FixedScalableVFPair Result(ElementCount::getFixed(1),
4605fe6060f1SDimitry Andric ElementCount::getScalable(0));
46060eae32dcSDimitry Andric if (auto MaxVF =
4607c9157d92SDimitry Andric getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
46080eae32dcSDimitry Andric MaxSafeFixedVF, FoldTailByMasking))
4609fe6060f1SDimitry Andric Result.FixedVF = MaxVF;
4610fe6060f1SDimitry Andric
46110eae32dcSDimitry Andric if (auto MaxVF =
4612c9157d92SDimitry Andric getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
46130eae32dcSDimitry Andric MaxSafeScalableVF, FoldTailByMasking))
4614fe6060f1SDimitry Andric if (MaxVF.isScalable()) {
4615fe6060f1SDimitry Andric Result.ScalableVF = MaxVF;
4616fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4617fe6060f1SDimitry Andric << "\n");
4618fe6060f1SDimitry Andric }
4619fe6060f1SDimitry Andric
4620fe6060f1SDimitry Andric return Result;
4621fe6060f1SDimitry Andric }
4622fe6060f1SDimitry Andric
4623fe6060f1SDimitry Andric FixedScalableVFPair
4624e8d8bef9SDimitry Andric LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
46258bcb0991SDimitry Andric if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
46268bcb0991SDimitry Andric // TODO: It may be useful to do this since it's still likely to be dynamically
46278bcb0991SDimitry Andric // uniform if the target can skip.
46288bcb0991SDimitry Andric reportVectorizationFailure(
46298bcb0991SDimitry Andric "Not inserting runtime ptr check for divergent target",
46308bcb0991SDimitry Andric "runtime pointer checks needed. Not enabled for divergent target",
46318bcb0991SDimitry Andric "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4632fe6060f1SDimitry Andric return FixedScalableVFPair::getNone();
46330b57cec5SDimitry Andric }
46340b57cec5SDimitry Andric
46358bcb0991SDimitry Andric unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4636c9157d92SDimitry Andric unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
46370b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
46380b57cec5SDimitry Andric if (TC == 1) {
46398bcb0991SDimitry Andric reportVectorizationFailure("Single iteration (non) loop",
46408bcb0991SDimitry Andric "loop trip count is one, irrelevant for vectorization",
46418bcb0991SDimitry Andric "SingleIterationLoop", ORE, TheLoop);
4642fe6060f1SDimitry Andric return FixedScalableVFPair::getNone();
46430b57cec5SDimitry Andric }
46440b57cec5SDimitry Andric
46458bcb0991SDimitry Andric switch (ScalarEpilogueStatus) {
46468bcb0991SDimitry Andric case CM_ScalarEpilogueAllowed:
4647c9157d92SDimitry Andric return computeFeasibleMaxVF(MaxTC, UserVF, false);
4648e8d8bef9SDimitry Andric case CM_ScalarEpilogueNotAllowedUsePredicate:
4649bdd1243dSDimitry Andric [[fallthrough]];
46508bcb0991SDimitry Andric case CM_ScalarEpilogueNotNeededUsePredicate:
46518bcb0991SDimitry Andric LLVM_DEBUG(
46528bcb0991SDimitry Andric dbgs() << "LV: vector predicate hint/switch found.\n"
46538bcb0991SDimitry Andric << "LV: Not allowing scalar epilogue, creating predicated "
46548bcb0991SDimitry Andric << "vector loop.\n");
46558bcb0991SDimitry Andric break;
46568bcb0991SDimitry Andric case CM_ScalarEpilogueNotAllowedLowTripLoop:
46578bcb0991SDimitry Andric // fallthrough as a special case of OptForSize
46588bcb0991SDimitry Andric case CM_ScalarEpilogueNotAllowedOptSize:
46598bcb0991SDimitry Andric if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
46608bcb0991SDimitry Andric LLVM_DEBUG(
46618bcb0991SDimitry Andric dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
46628bcb0991SDimitry Andric else
46638bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
46648bcb0991SDimitry Andric << "count.\n");
46650b57cec5SDimitry Andric
46668bcb0991SDimitry Andric // Bail if runtime checks are required, which are not good when optimising
46678bcb0991SDimitry Andric // for size.
46688bcb0991SDimitry Andric if (runtimeChecksRequired())
4669fe6060f1SDimitry Andric return FixedScalableVFPair::getNone();
4670e8d8bef9SDimitry Andric
46718bcb0991SDimitry Andric break;
46728bcb0991SDimitry Andric }
46730b57cec5SDimitry Andric
4674e8d8bef9SDimitry Andric // The only loops we can vectorize without a scalar epilogue are loops with
4675e8d8bef9SDimitry Andric // a bottom-test and a single exiting block. We'd have to handle the fact
4676e8d8bef9SDimitry Andric // that not every instruction executes on the last iteration. This will
4677e8d8bef9SDimitry Andric // require a lane mask which varies through the vector loop body. (TODO)
4678e8d8bef9SDimitry Andric if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4679e8d8bef9SDimitry Andric // If there was a tail-folding hint/switch, but we can't fold the tail by
4680e8d8bef9SDimitry Andric // masking, fallback to a vectorization with a scalar epilogue.
4681e8d8bef9SDimitry Andric if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4682e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4683e8d8bef9SDimitry Andric "scalar epilogue instead.\n");
4684e8d8bef9SDimitry Andric ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4685c9157d92SDimitry Andric return computeFeasibleMaxVF(MaxTC, UserVF, false);
4686e8d8bef9SDimitry Andric }
4687fe6060f1SDimitry Andric return FixedScalableVFPair::getNone();
4688e8d8bef9SDimitry Andric }
4689e8d8bef9SDimitry Andric
46908bcb0991SDimitry Andric // Now try the tail folding
46918bcb0991SDimitry Andric
46920b57cec5SDimitry Andric // Invalidate interleave groups that require an epilogue if we can't mask
46930b57cec5SDimitry Andric // the interleave-group.
46945ffd83dbSDimitry Andric if (!useMaskedInterleavedAccesses(TTI)) {
46955ffd83dbSDimitry Andric assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
46965ffd83dbSDimitry Andric "No decisions should have been taken at this point");
46975ffd83dbSDimitry Andric // Note: There is no need to invalidate any cost modeling decisions here, as
46985ffd83dbSDimitry Andric // none were taken so far.
46990b57cec5SDimitry Andric InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
47005ffd83dbSDimitry Andric }
47010b57cec5SDimitry Andric
4702c9157d92SDimitry Andric FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4703fe013be4SDimitry Andric
4704fe6060f1SDimitry Andric // Avoid tail folding if the trip count is known to be a multiple of any VF
4705fe013be4SDimitry Andric // we choose.
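  // E.g. with a known trip count of 64, UserIC == 1 and a maximum fixed VF of
  // 16, 64 % 16 == 0, so no tail remains and folding is unnecessary.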
4706fe013be4SDimitry Andric std::optional<unsigned> MaxPowerOf2RuntimeVF =
4707fe013be4SDimitry Andric MaxFactors.FixedVF.getFixedValue();
4708fe013be4SDimitry Andric if (MaxFactors.ScalableVF) {
4709fe013be4SDimitry Andric std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4710fe013be4SDimitry Andric if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4711fe013be4SDimitry Andric MaxPowerOf2RuntimeVF = std::max<unsigned>(
4712fe013be4SDimitry Andric *MaxPowerOf2RuntimeVF,
4713fe013be4SDimitry Andric *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4714fe013be4SDimitry Andric } else
4715fe013be4SDimitry Andric MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4716fe013be4SDimitry Andric }
4717fe013be4SDimitry Andric
4718fe013be4SDimitry Andric if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4719fe013be4SDimitry Andric assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4720fe6060f1SDimitry Andric "MaxFixedVF must be a power of 2");
4721fe013be4SDimitry Andric unsigned MaxVFtimesIC =
4722fe013be4SDimitry Andric UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4723e8d8bef9SDimitry Andric ScalarEvolution *SE = PSE.getSE();
4724e8d8bef9SDimitry Andric const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4725e8d8bef9SDimitry Andric const SCEV *ExitCount = SE->getAddExpr(
4726e8d8bef9SDimitry Andric BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4727e8d8bef9SDimitry Andric const SCEV *Rem = SE->getURemExpr(
4728fe6060f1SDimitry Andric SE->applyLoopGuards(ExitCount, TheLoop),
4729fe6060f1SDimitry Andric SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4730e8d8bef9SDimitry Andric if (Rem->isZero()) {
4731fe6060f1SDimitry Andric // Accept MaxFixedVF if we do not have a tail.
47320b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4733fe6060f1SDimitry Andric return MaxFactors;
47340b57cec5SDimitry Andric }
4735fe6060f1SDimitry Andric }
4736fe6060f1SDimitry Andric
47370b57cec5SDimitry Andric // If we don't know the precise trip count, or if the trip count that we
47380b57cec5SDimitry Andric // found modulo the vectorization factor is not zero, try to fold the tail
47390b57cec5SDimitry Andric // by masking.
47400b57cec5SDimitry Andric // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
47418bcb0991SDimitry Andric if (Legal->prepareToFoldTailByMasking()) {
4742fe013be4SDimitry Andric CanFoldTailByMasking = true;
4743fe6060f1SDimitry Andric return MaxFactors;
47440b57cec5SDimitry Andric }
47450b57cec5SDimitry Andric
4746e8d8bef9SDimitry Andric // If there was a tail-folding hint/switch, but we can't fold the tail by
4747e8d8bef9SDimitry Andric // masking, fallback to a vectorization with a scalar epilogue.
4748e8d8bef9SDimitry Andric if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4749e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4750e8d8bef9SDimitry Andric "scalar epilogue instead.\n");
4751e8d8bef9SDimitry Andric ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4752fe6060f1SDimitry Andric return MaxFactors;
4753e8d8bef9SDimitry Andric }
4754e8d8bef9SDimitry Andric
4755e8d8bef9SDimitry Andric if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4756e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4757fe6060f1SDimitry Andric return FixedScalableVFPair::getNone();
4758e8d8bef9SDimitry Andric }
4759e8d8bef9SDimitry Andric
47600b57cec5SDimitry Andric if (TC == 0) {
47618bcb0991SDimitry Andric reportVectorizationFailure(
47628bcb0991SDimitry Andric "Unable to calculate the loop count due to complex control flow",
47638bcb0991SDimitry Andric "unable to calculate the loop count due to complex control flow",
47648bcb0991SDimitry Andric "UnknownLoopCountComplexCFG", ORE, TheLoop);
4765fe6060f1SDimitry Andric return FixedScalableVFPair::getNone();
47660b57cec5SDimitry Andric }
47670b57cec5SDimitry Andric
47688bcb0991SDimitry Andric reportVectorizationFailure(
47698bcb0991SDimitry Andric "Cannot optimize for size and vectorize at the same time.",
47708bcb0991SDimitry Andric "cannot optimize for size and vectorize at the same time. "
47710b57cec5SDimitry Andric "Enable vectorization of this loop with '#pragma clang loop "
47728bcb0991SDimitry Andric "vectorize(enable)' when compiling with -Os/-Oz",
47738bcb0991SDimitry Andric "NoTailLoopWithOptForSize", ORE, TheLoop);
4774fe6060f1SDimitry Andric return FixedScalableVFPair::getNone();
47750b57cec5SDimitry Andric }
47760b57cec5SDimitry Andric
4777fe6060f1SDimitry Andric ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4778c9157d92SDimitry Andric unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
477981ad6265SDimitry Andric ElementCount MaxSafeVF, bool FoldTailByMasking) {
4780fe6060f1SDimitry Andric bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4781bdd1243dSDimitry Andric const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4782fe6060f1SDimitry Andric ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4783fe6060f1SDimitry Andric : TargetTransformInfo::RGK_FixedWidthVector);
4784e8d8bef9SDimitry Andric
4785fe6060f1SDimitry Andric // Convenience function to return the minimum of two ElementCounts.
4786fe6060f1SDimitry Andric auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4787fe6060f1SDimitry Andric assert((LHS.isScalable() == RHS.isScalable()) &&
4788fe6060f1SDimitry Andric "Scalable flags must match");
4789fe6060f1SDimitry Andric return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4790fe6060f1SDimitry Andric };
47910b57cec5SDimitry Andric
47925ffd83dbSDimitry Andric // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
47935ffd83dbSDimitry Andric // Note that both WidestRegister and WidestType may not be powers of 2.
4794fe6060f1SDimitry Andric auto MaxVectorElementCount = ElementCount::get(
4795fe013be4SDimitry Andric llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4796fe6060f1SDimitry Andric ComputeScalableMaxVF);
4797fe6060f1SDimitry Andric MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
47980b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4799fe6060f1SDimitry Andric << (MaxVectorElementCount * WidestType) << " bits.\n");
48000b57cec5SDimitry Andric
4801fe6060f1SDimitry Andric if (!MaxVectorElementCount) {
4802fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: The target has no "
4803fe6060f1SDimitry Andric << (ComputeScalableMaxVF ? "scalable" : "fixed")
4804fe6060f1SDimitry Andric << " vector registers.\n");
4805fe6060f1SDimitry Andric return ElementCount::getFixed(1);
48060b57cec5SDimitry Andric }
48070b57cec5SDimitry Andric
4808bdd1243dSDimitry Andric unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4809bdd1243dSDimitry Andric if (MaxVectorElementCount.isScalable() &&
4810bdd1243dSDimitry Andric TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4811bdd1243dSDimitry Andric auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4812bdd1243dSDimitry Andric auto Min = Attr.getVScaleRangeMin();
4813bdd1243dSDimitry Andric WidestRegisterMinEC *= Min;
4814bdd1243dSDimitry Andric }
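  // E.g. a scalable count of <vscale x 4> in a function annotated with
  // vscale_range(2, 16) is known to provide at least 4 * 2 = 8 lanes.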
4815fe013be4SDimitry Andric
4816fe013be4SDimitry Andric // When a scalar epilogue is required, at least one iteration of the scalar
4817c9157d92SDimitry Andric // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4818fe013be4SDimitry Andric // max VF that results in a dead vector loop.
4819c9157d92SDimitry Andric if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4820c9157d92SDimitry Andric MaxTripCount -= 1;
4821fe013be4SDimitry Andric
4822c9157d92SDimitry Andric if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4823c9157d92SDimitry Andric (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4824c9157d92SDimitry Andric // If an upper bound on the loop trip count (TC) is known at compile time, there is no
4825c9157d92SDimitry Andric // point in choosing VF greater than TC (as done in the loop below). Select
4826c9157d92SDimitry Andric // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4827c9157d92SDimitry Andric // scalable, we only fall back on a fixed VF when the TC is less than or
4828c9157d92SDimitry Andric // equal to the known number of lanes.
4829c9157d92SDimitry Andric auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
48300eae32dcSDimitry Andric LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
48310eae32dcSDimitry Andric "exceeding the constant trip count: "
4832c9157d92SDimitry Andric << ClampedUpperTripCount << "\n");
4833c9157d92SDimitry Andric return ElementCount::get(
4834c9157d92SDimitry Andric ClampedUpperTripCount,
4835c9157d92SDimitry Andric FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4836fe6060f1SDimitry Andric }
4837fe6060f1SDimitry Andric
483881ad6265SDimitry Andric TargetTransformInfo::RegisterKind RegKind =
483981ad6265SDimitry Andric ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
484081ad6265SDimitry Andric : TargetTransformInfo::RGK_FixedWidthVector;
4841fe6060f1SDimitry Andric ElementCount MaxVF = MaxVectorElementCount;
4842c9157d92SDimitry Andric if (MaximizeBandwidth ||
4843c9157d92SDimitry Andric (MaximizeBandwidth.getNumOccurrences() == 0 &&
4844c9157d92SDimitry Andric (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4845c9157d92SDimitry Andric (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4846fe6060f1SDimitry Andric auto MaxVectorElementCountMaxBW = ElementCount::get(
4847fe013be4SDimitry Andric llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4848fe6060f1SDimitry Andric ComputeScalableMaxVF);
4849fe6060f1SDimitry Andric MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4850fe6060f1SDimitry Andric
48510b57cec5SDimitry Andric // Collect all viable vectorization factors larger than the default MaxVF
4852fe6060f1SDimitry Andric // (i.e. MaxVectorElementCount).
4853e8d8bef9SDimitry Andric SmallVector<ElementCount, 8> VFs;
4854fe6060f1SDimitry Andric for (ElementCount VS = MaxVectorElementCount * 2;
4855fe6060f1SDimitry Andric ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4856fe6060f1SDimitry Andric VFs.push_back(VS);
48570b57cec5SDimitry Andric
48580b57cec5SDimitry Andric // For each VF calculate its register usage.
48590b57cec5SDimitry Andric auto RUs = calculateRegisterUsage(VFs);
48600b57cec5SDimitry Andric
48610b57cec5SDimitry Andric // Select the largest VF which doesn't require more registers than existing
48620b57cec5SDimitry Andric // ones.
48630b57cec5SDimitry Andric for (int i = RUs.size() - 1; i >= 0; --i) {
48648bcb0991SDimitry Andric bool Selected = true;
48658bcb0991SDimitry Andric for (auto &pair : RUs[i].MaxLocalUsers) {
48668bcb0991SDimitry Andric unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
48678bcb0991SDimitry Andric if (pair.second > TargetNumRegisters)
48688bcb0991SDimitry Andric Selected = false;
48698bcb0991SDimitry Andric }
48708bcb0991SDimitry Andric if (Selected) {
4871fe6060f1SDimitry Andric MaxVF = VFs[i];
48720b57cec5SDimitry Andric break;
48730b57cec5SDimitry Andric }
48740b57cec5SDimitry Andric }
4875fe6060f1SDimitry Andric if (ElementCount MinVF =
4876fe6060f1SDimitry Andric TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4877fe6060f1SDimitry Andric if (ElementCount::isKnownLT(MaxVF, MinVF)) {
48780b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
48790b57cec5SDimitry Andric << ") with target's minimum: " << MinVF << '\n');
48800b57cec5SDimitry Andric MaxVF = MinVF;
48810b57cec5SDimitry Andric }
48820b57cec5SDimitry Andric }
488381ad6265SDimitry Andric
488481ad6265SDimitry Andric // Invalidate any widening decisions we might have made, in case the loop
488581ad6265SDimitry Andric // requires predication (decided later), but we have already made some
488681ad6265SDimitry Andric // load/store widening decisions.
488781ad6265SDimitry Andric invalidateCostModelingDecisions();
48880b57cec5SDimitry Andric }
4889fe6060f1SDimitry Andric return MaxVF;
48900b57cec5SDimitry Andric }
48910b57cec5SDimitry Andric
4892fe013be4SDimitry Andric /// Convenience function that returns the value of vscale_range if
4893fe013be4SDimitry Andric /// vscale_range.min == vscale_range.max, and otherwise returns the value
4894fe013be4SDimitry Andric /// returned by the corresponding TTI method.
4895fe013be4SDimitry Andric static std::optional<unsigned>
4896fe013be4SDimitry Andric getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4897fe013be4SDimitry Andric const Function *Fn = L->getHeader()->getParent();
4898fe013be4SDimitry Andric if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4899fe013be4SDimitry Andric auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4900d56accc7SDimitry Andric auto Min = Attr.getVScaleRangeMin();
4901d56accc7SDimitry Andric auto Max = Attr.getVScaleRangeMax();
4902d56accc7SDimitry Andric if (Max && Min == Max)
4903d56accc7SDimitry Andric return Max;
4904d56accc7SDimitry Andric }
4905d56accc7SDimitry Andric
4906d56accc7SDimitry Andric return TTI.getVScaleForTuning();
4907d56accc7SDimitry Andric }
4908d56accc7SDimitry Andric
4909fe013be4SDimitry Andric bool LoopVectorizationPlanner::isMoreProfitable(
4910fe6060f1SDimitry Andric const VectorizationFactor &A, const VectorizationFactor &B) const {
4911fe6060f1SDimitry Andric InstructionCost CostA = A.Cost;
4912fe6060f1SDimitry Andric InstructionCost CostB = B.Cost;
4913e8d8bef9SDimitry Andric
4914fe013be4SDimitry Andric unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4915fe6060f1SDimitry Andric
4916fe013be4SDimitry Andric if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4917fe013be4SDimitry Andric // If the trip count is a known (possibly small) constant, the trip count
4918fe013be4SDimitry Andric // will be rounded up to an integer number of iterations under
4919fe013be4SDimitry Andric // FoldTailByMasking. The total cost in that case will be
4920fe013be4SDimitry Andric // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4921fe013be4SDimitry Andric // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4922fe013be4SDimitry Andric // some extra overheads, but for the purpose of comparing the costs of
4923fe013be4SDimitry Andric // different VFs we can use this to compare the total loop-body cost
4924fe013be4SDimitry Andric // expected after vectorization.
4925fe013be4SDimitry Andric auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4926fe013be4SDimitry Andric InstructionCost VectorCost,
4927fe013be4SDimitry Andric InstructionCost ScalarCost) {
4928fe013be4SDimitry Andric return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4929fe013be4SDimitry Andric : VectorCost * (MaxTripCount / VF) +
4930fe013be4SDimitry Andric ScalarCost * (MaxTripCount % VF);
4931fe013be4SDimitry Andric };
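    // For illustration: with MaxTripCount = 10, VF = 4, VectorCost = 20 and
    // ScalarCost = 5, folding the tail costs 20 * ceil(10/4) = 60, whereas
    // using a scalar epilogue costs 20 * 2 + 5 * 2 = 50.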
4932fe013be4SDimitry Andric auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4933fe013be4SDimitry Andric auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4934fe013be4SDimitry Andric
4935fe6060f1SDimitry Andric return RTCostA < RTCostB;
4936fe6060f1SDimitry Andric }
4937fe6060f1SDimitry Andric
4938349cc55cSDimitry Andric // Improve estimate for the vector width if it is scalable.
4939349cc55cSDimitry Andric unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4940349cc55cSDimitry Andric unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4941fe013be4SDimitry Andric if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4942349cc55cSDimitry Andric if (A.Width.isScalable())
4943bdd1243dSDimitry Andric EstimatedWidthA *= *VScale;
4944349cc55cSDimitry Andric if (B.Width.isScalable())
4945bdd1243dSDimitry Andric EstimatedWidthB *= *VScale;
4946349cc55cSDimitry Andric }
4947349cc55cSDimitry Andric
49480eae32dcSDimitry Andric // Assume vscale may be larger than 1 (or the value being tuned for),
49490eae32dcSDimitry Andric // so that scalable vectorization is slightly favorable over fixed-width
49500eae32dcSDimitry Andric // vectorization.
4951fe6060f1SDimitry Andric if (A.Width.isScalable() && !B.Width.isScalable())
4952349cc55cSDimitry Andric return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4953fe6060f1SDimitry Andric
4954fe6060f1SDimitry Andric // To avoid the need for FP division:
4955fe6060f1SDimitry Andric // (CostA / A.Width) < (CostB / B.Width)
4956fe6060f1SDimitry Andric // <=> (CostA * B.Width) < (CostB * A.Width)
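  // E.g. comparing (VF=4, cost 10) with (VF=8, cost 18): 10 * 8 = 80 is not
  // less than 18 * 4 = 72, so the wider factor is preferred (2.25 vs 2.5
  // cost per lane).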
4957349cc55cSDimitry Andric return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4958fe6060f1SDimitry Andric }
4959fe6060f1SDimitry Andric
4960fe013be4SDimitry Andric static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4961fe013be4SDimitry Andric OptimizationRemarkEmitter *ORE,
4962fe013be4SDimitry Andric Loop *TheLoop) {
4963fe013be4SDimitry Andric if (InvalidCosts.empty())
4964fe013be4SDimitry Andric return;
4965e8d8bef9SDimitry Andric
4966fe6060f1SDimitry Andric // Emit a report of VFs with invalid costs in the loop.
4967fe013be4SDimitry Andric
4968fe6060f1SDimitry Andric // Group the remarks per instruction, keeping the instruction order from
4969fe6060f1SDimitry Andric // InvalidCosts.
4970fe6060f1SDimitry Andric std::map<Instruction *, unsigned> Numbering;
4971fe6060f1SDimitry Andric unsigned I = 0;
4972fe6060f1SDimitry Andric for (auto &Pair : InvalidCosts)
4973fe6060f1SDimitry Andric if (!Numbering.count(Pair.first))
4974fe6060f1SDimitry Andric Numbering[Pair.first] = I++;
4975fe6060f1SDimitry Andric
4976fe6060f1SDimitry Andric // Sort the list, first on instruction(number) then on VF.
4977fe013be4SDimitry Andric sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4978fe6060f1SDimitry Andric if (Numbering[A.first] != Numbering[B.first])
4979fe6060f1SDimitry Andric return Numbering[A.first] < Numbering[B.first];
4980fe6060f1SDimitry Andric ElementCountComparator ECC;
4981fe6060f1SDimitry Andric return ECC(A.second, B.second);
4982fe6060f1SDimitry Andric });
4983fe6060f1SDimitry Andric
4984fe6060f1SDimitry Andric // For a list of ordered instruction-vf pairs:
4985fe6060f1SDimitry Andric // [(load, vf1), (load, vf2), (store, vf1)]
4986fe6060f1SDimitry Andric // Group the instructions together to emit separate remarks for:
4987fe6060f1SDimitry Andric // load (vf1, vf2)
4988fe6060f1SDimitry Andric // store (vf1)
4989fe6060f1SDimitry Andric auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4990fe6060f1SDimitry Andric auto Subset = ArrayRef<InstructionVFPair>();
4991fe6060f1SDimitry Andric do {
4992fe6060f1SDimitry Andric if (Subset.empty())
4993fe6060f1SDimitry Andric Subset = Tail.take_front(1);
4994fe6060f1SDimitry Andric
4995fe6060f1SDimitry Andric Instruction *I = Subset.front().first;
4996fe6060f1SDimitry Andric
4997fe6060f1SDimitry Andric // If the next instruction is different, or if there are no other pairs,
4998fe6060f1SDimitry Andric // emit a remark for the collated subset. e.g.
4999fe6060f1SDimitry Andric // [(load, vf1), (load, vf2)]
5000fe6060f1SDimitry Andric // to emit:
5001fe6060f1SDimitry Andric // remark: invalid costs for 'load' at VF=(vf1, vf2)
5002fe6060f1SDimitry Andric if (Subset == Tail || Tail[Subset.size()].first != I) {
5003fe6060f1SDimitry Andric std::string OutString;
5004fe6060f1SDimitry Andric raw_string_ostream OS(OutString);
5005fe6060f1SDimitry Andric assert(!Subset.empty() && "Unexpected empty range");
5006fe6060f1SDimitry Andric OS << "Instruction with invalid costs prevented vectorization at VF=(";
5007bdd1243dSDimitry Andric for (const auto &Pair : Subset)
5008fe013be4SDimitry Andric OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
5009fe6060f1SDimitry Andric OS << "):";
5010fe6060f1SDimitry Andric if (auto *CI = dyn_cast<CallInst>(I))
5011fe6060f1SDimitry Andric OS << " call to " << CI->getCalledFunction()->getName();
5012fe6060f1SDimitry Andric else
5013fe6060f1SDimitry Andric OS << " " << I->getOpcodeName();
5014fe6060f1SDimitry Andric OS.flush();
5015fe6060f1SDimitry Andric reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5016fe6060f1SDimitry Andric Tail = Tail.drop_front(Subset.size());
5017fe6060f1SDimitry Andric Subset = {};
5018fe6060f1SDimitry Andric } else
5019fe6060f1SDimitry Andric // Grow the subset by one element
5020fe6060f1SDimitry Andric Subset = Tail.take_front(Subset.size() + 1);
5021fe6060f1SDimitry Andric } while (!Tail.empty());
50220b57cec5SDimitry Andric }
50230b57cec5SDimitry Andric
5024fe013be4SDimitry Andric VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
5025fe013be4SDimitry Andric const ElementCountSet &VFCandidates) {
5026fe013be4SDimitry Andric InstructionCost ExpectedCost =
5027fe013be4SDimitry Andric CM.expectedCost(ElementCount::getFixed(1)).first;
5028fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5029fe013be4SDimitry Andric assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5030fe013be4SDimitry Andric assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5031fe013be4SDimitry Andric "Expected Scalar VF to be a candidate");
5032fe013be4SDimitry Andric
5033fe013be4SDimitry Andric const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5034fe013be4SDimitry Andric ExpectedCost);
5035fe013be4SDimitry Andric VectorizationFactor ChosenFactor = ScalarCost;
5036fe013be4SDimitry Andric
5037fe013be4SDimitry Andric bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5038fe013be4SDimitry Andric if (ForceVectorization && VFCandidates.size() > 1) {
5039fe013be4SDimitry Andric // Ignore scalar width, because the user explicitly wants vectorization.
5040fe013be4SDimitry Andric // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5041fe013be4SDimitry Andric // evaluation.
5042fe013be4SDimitry Andric ChosenFactor.Cost = InstructionCost::getMax();
5043fe013be4SDimitry Andric }
5044fe013be4SDimitry Andric
5045fe013be4SDimitry Andric SmallVector<InstructionVFPair> InvalidCosts;
5046fe013be4SDimitry Andric for (const auto &i : VFCandidates) {
5047fe013be4SDimitry Andric // The cost for scalar VF=1 is already calculated, so ignore it.
5048fe013be4SDimitry Andric if (i.isScalar())
5049fe013be4SDimitry Andric continue;
5050fe013be4SDimitry Andric
5051fe013be4SDimitry Andric LoopVectorizationCostModel::VectorizationCostTy C =
5052fe013be4SDimitry Andric CM.expectedCost(i, &InvalidCosts);
5053fe013be4SDimitry Andric VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5054fe013be4SDimitry Andric
5055fe013be4SDimitry Andric #ifndef NDEBUG
50566c20abcdSDimitry Andric unsigned AssumedMinimumVscale =
50576c20abcdSDimitry Andric getVScaleForTuning(OrigLoop, TTI).value_or(1);
5058fe013be4SDimitry Andric unsigned Width =
5059fe013be4SDimitry Andric Candidate.Width.isScalable()
5060fe013be4SDimitry Andric ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5061fe013be4SDimitry Andric : Candidate.Width.getFixedValue();
5062fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5063fe013be4SDimitry Andric << " costs: " << (Candidate.Cost / Width));
5064fe013be4SDimitry Andric if (i.isScalable())
5065fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5066fe013be4SDimitry Andric << AssumedMinimumVscale << ")");
5067fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << ".\n");
5068fe013be4SDimitry Andric #endif
5069fe013be4SDimitry Andric
5070fe013be4SDimitry Andric if (!C.second && !ForceVectorization) {
5071fe013be4SDimitry Andric LLVM_DEBUG(
5072fe013be4SDimitry Andric dbgs() << "LV: Not considering vector loop of width " << i
5073fe013be4SDimitry Andric << " because it will not generate any vector instructions.\n");
5074fe013be4SDimitry Andric continue;
5075fe013be4SDimitry Andric }
5076fe013be4SDimitry Andric
5077fe013be4SDimitry Andric // If profitable, add it to the ProfitableVFs list.
5078fe013be4SDimitry Andric if (isMoreProfitable(Candidate, ScalarCost))
5079fe013be4SDimitry Andric ProfitableVFs.push_back(Candidate);
5080fe013be4SDimitry Andric
5081fe013be4SDimitry Andric if (isMoreProfitable(Candidate, ChosenFactor))
5082fe013be4SDimitry Andric ChosenFactor = Candidate;
5083fe013be4SDimitry Andric }
5084fe013be4SDimitry Andric
5085fe013be4SDimitry Andric emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5086fe013be4SDimitry Andric
5087fe013be4SDimitry Andric if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5088fe013be4SDimitry Andric reportVectorizationFailure(
5089fe013be4SDimitry Andric "There are conditional stores.",
50908bcb0991SDimitry Andric "store that is conditionally executed prevents vectorization",
5091fe013be4SDimitry Andric "ConditionalStore", ORE, OrigLoop);
5092fe6060f1SDimitry Andric ChosenFactor = ScalarCost;
50930b57cec5SDimitry Andric }
50940b57cec5SDimitry Andric
5095fe6060f1SDimitry Andric LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5096fcaf7f86SDimitry Andric !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
50970b57cec5SDimitry Andric << "LV: Vectorization seems to be not beneficial, "
50980b57cec5SDimitry Andric << "but was forced by a user.\n");
5099fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5100fe6060f1SDimitry Andric return ChosenFactor;
51010b57cec5SDimitry Andric }
51020b57cec5SDimitry Andric
5103fe013be4SDimitry Andric bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5104fe013be4SDimitry Andric ElementCount VF) const {
5105e8d8bef9SDimitry Andric // Cross iteration phis such as reductions need special handling and are
5106e8d8bef9SDimitry Andric // currently unsupported.
5107fe013be4SDimitry Andric if (any_of(OrigLoop->getHeader()->phis(),
5108bdd1243dSDimitry Andric [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5109e8d8bef9SDimitry Andric return false;
5110e8d8bef9SDimitry Andric
5111e8d8bef9SDimitry Andric // Phis with uses outside of the loop require special handling and are
5112e8d8bef9SDimitry Andric // currently unsupported.
5113bdd1243dSDimitry Andric for (const auto &Entry : Legal->getInductionVars()) {
5114e8d8bef9SDimitry Andric // Look for uses of the value of the induction at the last iteration.
5115fe013be4SDimitry Andric Value *PostInc =
5116fe013be4SDimitry Andric Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5117e8d8bef9SDimitry Andric for (User *U : PostInc->users())
5118fe013be4SDimitry Andric if (!OrigLoop->contains(cast<Instruction>(U)))
5119e8d8bef9SDimitry Andric return false;
5120e8d8bef9SDimitry Andric // Look for uses of penultimate value of the induction.
5121e8d8bef9SDimitry Andric for (User *U : Entry.first->users())
5122fe013be4SDimitry Andric if (!OrigLoop->contains(cast<Instruction>(U)))
5123e8d8bef9SDimitry Andric return false;
5124e8d8bef9SDimitry Andric }
5125e8d8bef9SDimitry Andric
5126fe6060f1SDimitry Andric // Epilogue vectorization code has not been audited to ensure it handles
5127fe6060f1SDimitry Andric // non-latch exits properly. It may be fine, but it needs to be audited and
5128fe6060f1SDimitry Andric // tested.
5129fe013be4SDimitry Andric if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5130fe6060f1SDimitry Andric return false;
5131fe6060f1SDimitry Andric
5132e8d8bef9SDimitry Andric return true;
5133e8d8bef9SDimitry Andric }
5134e8d8bef9SDimitry Andric
5135e8d8bef9SDimitry Andric bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5136e8d8bef9SDimitry Andric const ElementCount VF) const {
5137e8d8bef9SDimitry Andric // FIXME: We need a much better cost-model to take different parameters such
5138e8d8bef9SDimitry Andric // as register pressure, code size increase and cost of extra branches into
5139e8d8bef9SDimitry Andric // account. For now we apply a very crude heuristic and only consider loops
5140e8d8bef9SDimitry Andric // with vectorization factors larger than a certain value.
5141bdd1243dSDimitry Andric
5142bdd1243dSDimitry Andric // Allow the target to opt out entirely.
5143bdd1243dSDimitry Andric if (!TTI.preferEpilogueVectorization())
5144bdd1243dSDimitry Andric return false;
5145bdd1243dSDimitry Andric
5146e8d8bef9SDimitry Andric // We also consider epilogue vectorization unprofitable for targets that don't
5147e8d8bef9SDimitry Andric // consider interleaving beneficial (e.g. MVE).
5148fe013be4SDimitry Andric if (TTI.getMaxInterleaveFactor(VF) <= 1)
5149e8d8bef9SDimitry Andric return false;
5150fe013be4SDimitry Andric
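// Rough illustration (assuming the EpilogueVectorizationMinVF default of 16):
// a main loop VF of vscale x 4 with a tuning vscale of 4 gives 4 * 4 = 16 >= 16
// and is considered profitable, whereas a fixed main loop VF of 8
// (Multiplier == 1) gives 8 < 16 and is rejected.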
5151fe013be4SDimitry Andric unsigned Multiplier = 1;
5152fe013be4SDimitry Andric if (VF.isScalable())
5153fe013be4SDimitry Andric Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5154fe013be4SDimitry Andric if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5155e8d8bef9SDimitry Andric return true;
5156e8d8bef9SDimitry Andric return false;
5157e8d8bef9SDimitry Andric }
5158e8d8bef9SDimitry Andric
5159fe013be4SDimitry Andric VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5160fe013be4SDimitry Andric const ElementCount MainLoopVF, unsigned IC) {
5161e8d8bef9SDimitry Andric VectorizationFactor Result = VectorizationFactor::Disabled();
5162e8d8bef9SDimitry Andric if (!EnableEpilogueVectorization) {
5163fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5164e8d8bef9SDimitry Andric return Result;
5165e8d8bef9SDimitry Andric }
5166e8d8bef9SDimitry Andric
5167fe013be4SDimitry Andric if (!CM.isScalarEpilogueAllowed()) {
5168fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5169fe013be4SDimitry Andric "epilogue is allowed.\n");
5170e8d8bef9SDimitry Andric return Result;
5171e8d8bef9SDimitry Andric }
5172e8d8bef9SDimitry Andric
5173e8d8bef9SDimitry Andric // Not really a cost consideration, but check for unsupported cases here to
5174e8d8bef9SDimitry Andric // simplify the logic.
5175fe013be4SDimitry Andric if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5176fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5177fe013be4SDimitry Andric "is not a supported candidate.\n");
5178e8d8bef9SDimitry Andric return Result;
5179e8d8bef9SDimitry Andric }
5180e8d8bef9SDimitry Andric
5181e8d8bef9SDimitry Andric if (EpilogueVectorizationForceVF > 1) {
5182fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5183349cc55cSDimitry Andric ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5184fe013be4SDimitry Andric if (hasPlanWithVF(ForcedEC))
518581ad6265SDimitry Andric return {ForcedEC, 0, 0};
5186e8d8bef9SDimitry Andric else {
5187fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5188fe013be4SDimitry Andric "viable.\n");
5189e8d8bef9SDimitry Andric return Result;
5190e8d8bef9SDimitry Andric }
5191e8d8bef9SDimitry Andric }
5192e8d8bef9SDimitry Andric
5193fe013be4SDimitry Andric if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5194fe013be4SDimitry Andric OrigLoop->getHeader()->getParent()->hasMinSize()) {
5195e8d8bef9SDimitry Andric LLVM_DEBUG(
5196fe013be4SDimitry Andric dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5197e8d8bef9SDimitry Andric return Result;
5198e8d8bef9SDimitry Andric }
5199e8d8bef9SDimitry Andric
5200fe013be4SDimitry Andric if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5201349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5202349cc55cSDimitry Andric "this loop\n");
5203e8d8bef9SDimitry Andric return Result;
5204349cc55cSDimitry Andric }
5205e8d8bef9SDimitry Andric
5206d56accc7SDimitry Andric // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5207d56accc7SDimitry Andric // the main loop handles 8 lanes per iteration. We could still benefit from
5208d56accc7SDimitry Andric // vectorizing the epilogue loop with VF=4.
5209d56accc7SDimitry Andric ElementCount EstimatedRuntimeVF = MainLoopVF;
5210d56accc7SDimitry Andric if (MainLoopVF.isScalable()) {
5211d56accc7SDimitry Andric EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5212fe013be4SDimitry Andric if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
521381ad6265SDimitry Andric EstimatedRuntimeVF *= *VScale;
5214d56accc7SDimitry Andric }
5215d56accc7SDimitry Andric
5216fe013be4SDimitry Andric ScalarEvolution &SE = *PSE.getSE();
5217fe013be4SDimitry Andric Type *TCType = Legal->getWidestInductionType();
5218fe013be4SDimitry Andric const SCEV *RemainingIterations = nullptr;
5219fe013be4SDimitry Andric for (auto &NextVF : ProfitableVFs) {
5220fe013be4SDimitry Andric // Skip candidate VFs without a corresponding VPlan.
5221fe013be4SDimitry Andric if (!hasPlanWithVF(NextVF.Width))
5222fe013be4SDimitry Andric continue;
5223fe013be4SDimitry Andric
5224fe013be4SDimitry Andric // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5225fe013be4SDimitry Andric // vectors) or the VF of the main loop (fixed vectors).
5226fe013be4SDimitry Andric if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5227fe013be4SDimitry Andric ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5228fe013be4SDimitry Andric ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5229fe013be4SDimitry Andric continue;
5230fe013be4SDimitry Andric
5231fe013be4SDimitry Andric // If NextVF is greater than the number of remaining iterations, the
5232fe013be4SDimitry Andric // epilogue loop would be dead. Skip such factors.
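// Rough illustration (hypothetical numbers): with a trip count of 35,
// MainLoopVF = 8 and IC = 2, RemainingIterations = 35 urem (8 * 2) = 3, so a
// candidate epilogue VF of 4 (4 > 3) is skipped while VF = 2 is still
// considered.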
5233fe013be4SDimitry Andric if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5234fe013be4SDimitry Andric // TODO: extend to support scalable VFs.
5235fe013be4SDimitry Andric if (!RemainingIterations) {
5236fe013be4SDimitry Andric const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5237fe013be4SDimitry Andric RemainingIterations = SE.getURemExpr(
5238fe013be4SDimitry Andric TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5239fe013be4SDimitry Andric }
5240fe013be4SDimitry Andric if (SE.isKnownPredicate(
5241fe013be4SDimitry Andric CmpInst::ICMP_UGT,
5242fe013be4SDimitry Andric SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5243fe013be4SDimitry Andric RemainingIterations))
5244fe013be4SDimitry Andric continue;
5245fe013be4SDimitry Andric }
5246fe013be4SDimitry Andric
5247fe013be4SDimitry Andric if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5248e8d8bef9SDimitry Andric Result = NextVF;
5249fe013be4SDimitry Andric }
5250e8d8bef9SDimitry Andric
5251e8d8bef9SDimitry Andric if (Result != VectorizationFactor::Disabled())
5252e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5253fe013be4SDimitry Andric << Result.Width << "\n");
5254e8d8bef9SDimitry Andric return Result;
5255e8d8bef9SDimitry Andric }
5256e8d8bef9SDimitry Andric
52570b57cec5SDimitry Andric std::pair<unsigned, unsigned>
52580b57cec5SDimitry Andric LoopVectorizationCostModel::getSmallestAndWidestTypes() {
52590b57cec5SDimitry Andric unsigned MinWidth = -1U;
52600b57cec5SDimitry Andric unsigned MaxWidth = 8;
52610b57cec5SDimitry Andric const DataLayout &DL = TheFunction->getParent()->getDataLayout();
526204eeddc0SDimitry Andric // For in-loop reductions, no element types are added to ElementTypesInLoop
526304eeddc0SDimitry Andric // if there are no loads/stores in the loop. In this case, check through the
526404eeddc0SDimitry Andric // reduction variables to determine the maximum width.
526504eeddc0SDimitry Andric if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
526604eeddc0SDimitry Andric // Reset MaxWidth so that we can find the smallest type used by recurrences
526704eeddc0SDimitry Andric // in the loop.
526804eeddc0SDimitry Andric MaxWidth = -1U;
5269bdd1243dSDimitry Andric for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
527004eeddc0SDimitry Andric const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
527104eeddc0SDimitry Andric // When finding the min width used by the recurrence we need to account
527204eeddc0SDimitry Andric // for casts on the input operands of the recurrence.
527304eeddc0SDimitry Andric MaxWidth = std::min<unsigned>(
527404eeddc0SDimitry Andric MaxWidth, std::min<unsigned>(
527504eeddc0SDimitry Andric RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
527604eeddc0SDimitry Andric RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
527704eeddc0SDimitry Andric }
527804eeddc0SDimitry Andric } else {
5279fe6060f1SDimitry Andric for (Type *T : ElementTypesInLoop) {
5280fe6060f1SDimitry Andric MinWidth = std::min<unsigned>(
5281bdd1243dSDimitry Andric MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5282fe6060f1SDimitry Andric MaxWidth = std::max<unsigned>(
5283bdd1243dSDimitry Andric MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5284fe6060f1SDimitry Andric }
528504eeddc0SDimitry Andric }
5286fe6060f1SDimitry Andric return {MinWidth, MaxWidth};
5287fe6060f1SDimitry Andric }
52880b57cec5SDimitry Andric
5289fe6060f1SDimitry Andric void LoopVectorizationCostModel::collectElementTypesForWidening() {
5290fe6060f1SDimitry Andric ElementTypesInLoop.clear();
52910b57cec5SDimitry Andric // For each block.
52920b57cec5SDimitry Andric for (BasicBlock *BB : TheLoop->blocks()) {
52930b57cec5SDimitry Andric // For each instruction in the loop.
52940b57cec5SDimitry Andric for (Instruction &I : BB->instructionsWithoutDebug()) {
52950b57cec5SDimitry Andric Type *T = I.getType();
52960b57cec5SDimitry Andric
52970b57cec5SDimitry Andric // Skip ignored values.
52985ffd83dbSDimitry Andric if (ValuesToIgnore.count(&I))
52990b57cec5SDimitry Andric continue;
53000b57cec5SDimitry Andric
53010b57cec5SDimitry Andric // Only examine Loads, Stores and PHINodes.
53020b57cec5SDimitry Andric if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
53030b57cec5SDimitry Andric continue;
53040b57cec5SDimitry Andric
53050b57cec5SDimitry Andric // Examine PHI nodes that are reduction variables. Update the type to
53060b57cec5SDimitry Andric // account for the recurrence type.
53070b57cec5SDimitry Andric if (auto *PN = dyn_cast<PHINode>(&I)) {
53080b57cec5SDimitry Andric if (!Legal->isReductionVariable(PN))
53090b57cec5SDimitry Andric continue;
53100eae32dcSDimitry Andric const RecurrenceDescriptor &RdxDesc =
53110eae32dcSDimitry Andric Legal->getReductionVars().find(PN)->second;
5312fe6060f1SDimitry Andric if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5313e8d8bef9SDimitry Andric TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5314e8d8bef9SDimitry Andric RdxDesc.getRecurrenceType(),
5315e8d8bef9SDimitry Andric TargetTransformInfo::ReductionFlags()))
5316e8d8bef9SDimitry Andric continue;
53170b57cec5SDimitry Andric T = RdxDesc.getRecurrenceType();
53180b57cec5SDimitry Andric }
53190b57cec5SDimitry Andric
53200b57cec5SDimitry Andric // Examine the stored values.
53210b57cec5SDimitry Andric if (auto *ST = dyn_cast<StoreInst>(&I))
53220b57cec5SDimitry Andric T = ST->getValueOperand()->getType();
53230b57cec5SDimitry Andric
532404eeddc0SDimitry Andric assert(T->isSized() &&
532504eeddc0SDimitry Andric "Expected the load/store/recurrence type to be sized");
53260b57cec5SDimitry Andric
5327fe6060f1SDimitry Andric ElementTypesInLoop.insert(T);
53280b57cec5SDimitry Andric }
53290b57cec5SDimitry Andric }
53300b57cec5SDimitry Andric }
53310b57cec5SDimitry Andric
5332bdd1243dSDimitry Andric unsigned
5333bdd1243dSDimitry Andric LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5334bdd1243dSDimitry Andric InstructionCost LoopCost) {
53350b57cec5SDimitry Andric // -- The interleave heuristics --
53360b57cec5SDimitry Andric // We interleave the loop in order to expose ILP and reduce the loop overhead.
53370b57cec5SDimitry Andric // There are many micro-architectural considerations that we can't predict
53380b57cec5SDimitry Andric // at this level. For example, frontend pressure (on decode or fetch) due to
53390b57cec5SDimitry Andric // code size, or the number and capabilities of the execution ports.
53400b57cec5SDimitry Andric //
53410b57cec5SDimitry Andric // We use the following heuristics to select the interleave count:
53420b57cec5SDimitry Andric // 1. If the code has reductions, then we interleave to break the cross
53430b57cec5SDimitry Andric // iteration dependency.
53440b57cec5SDimitry Andric // 2. If the loop is really small, then we interleave to reduce the loop
53450b57cec5SDimitry Andric // overhead.
53460b57cec5SDimitry Andric // 3. We don't interleave if we think that we will spill registers to memory
53470b57cec5SDimitry Andric // due to the increased register pressure.
53480b57cec5SDimitry Andric
53498bcb0991SDimitry Andric if (!isScalarEpilogueAllowed())
53500b57cec5SDimitry Andric return 1;
53510b57cec5SDimitry Andric
53520b57cec5SDimitry Andric // A finite maximum safe dependence distance already limits how many
53520b57cec5SDimitry Andric // iterations can be combined, so do not interleave further.
5353fe013be4SDimitry Andric if (!Legal->isSafeForAnyVectorWidth())
53540b57cec5SDimitry Andric return 1;
53550b57cec5SDimitry Andric
5356480093f4SDimitry Andric auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5357e8d8bef9SDimitry Andric const bool HasReductions = !Legal->getReductionVars().empty();
5358e8d8bef9SDimitry Andric // Do not interleave loops with a relatively small known or estimated trip
5359e8d8bef9SDimitry Andric // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5360e8d8bef9SDimitry Andric // enabled and the code has scalar reductions (HasReductions && VF == 1),
5361e8d8bef9SDimitry Andric // because with the above conditions interleaving can expose ILP and break
5362e8d8bef9SDimitry Andric // cross iteration dependences for reductions.
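// Rough illustration (assuming the TinyTripCountInterleaveThreshold default of
// 128): a loop with an estimated trip count of 64 is not interleaved here
// unless InterleaveSmallLoopScalarReduction is set and it is a scalar
// (VF == 1) reduction loop.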
5363e8d8bef9SDimitry Andric if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5364e8d8bef9SDimitry Andric !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
53650b57cec5SDimitry Andric return 1;
53660b57cec5SDimitry Andric
53673a9a9c0cSDimitry Andric // If we did not calculate the cost for VF (because the user selected the VF)
53683a9a9c0cSDimitry Andric // then we calculate the cost of VF here.
53693a9a9c0cSDimitry Andric if (LoopCost == 0) {
5370bdd1243dSDimitry Andric LoopCost = expectedCost(VF).first;
5371bdd1243dSDimitry Andric assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
53723a9a9c0cSDimitry Andric
53733a9a9c0cSDimitry Andric // Loop body is free and there is no need for interleaving.
53743a9a9c0cSDimitry Andric if (LoopCost == 0)
53753a9a9c0cSDimitry Andric return 1;
53763a9a9c0cSDimitry Andric }
53773a9a9c0cSDimitry Andric
53780b57cec5SDimitry Andric RegisterUsage R = calculateRegisterUsage({VF})[0];
53790b57cec5SDimitry Andric // We divide by these constants so assume that we have at least one
53800b57cec5SDimitry Andric // instruction that uses at least one register.
53818bcb0991SDimitry Andric for (auto& pair : R.MaxLocalUsers) {
53828bcb0991SDimitry Andric pair.second = std::max(pair.second, 1U);
53838bcb0991SDimitry Andric }
53840b57cec5SDimitry Andric
53850b57cec5SDimitry Andric // We calculate the interleave count using the following formula.
53860b57cec5SDimitry Andric // Subtract the number of loop invariants from the number of available
53870b57cec5SDimitry Andric // registers. These registers are used by all of the interleaved instances.
53880b57cec5SDimitry Andric // Next, divide the remaining registers by the number of registers that is
53890b57cec5SDimitry Andric // required by the loop, in order to estimate how many parallel instances
53900b57cec5SDimitry Andric // fit without causing spills. All of this is rounded down if necessary to be
53910b57cec5SDimitry Andric // a power of two. We want power of two interleave count to simplify any
53920b57cec5SDimitry Andric // addressing operations or alignment considerations.
53930b57cec5SDimitry Andric // We also want power of two interleave counts to ensure that the induction
53940b57cec5SDimitry Andric // variable of the vector loop wraps to zero, when tail is folded by masking;
53950b57cec5SDimitry Andric // this currently happens when OptForSize, in which case IC is set to 1 above.
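// Rough worked example (hypothetical numbers): for a register class with 32
// registers, 2 loop-invariant values and a peak local usage of 6 registers,
// the estimate below is bit_floor((32 - 2) / 6) = bit_floor(5) = 4 interleaved
// copies; with the induction-variable heuristic it is
// bit_floor((32 - 2 - 1) / (6 - 1)) = 4 as well.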
53968bcb0991SDimitry Andric unsigned IC = UINT_MAX;
53970b57cec5SDimitry Andric
53988bcb0991SDimitry Andric for (auto& pair : R.MaxLocalUsers) {
53998bcb0991SDimitry Andric unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
54008bcb0991SDimitry Andric LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
54018bcb0991SDimitry Andric << " registers of "
54028bcb0991SDimitry Andric << TTI.getRegisterClassName(pair.first) << " register class\n");
5403e8d8bef9SDimitry Andric if (VF.isScalar()) {
54048bcb0991SDimitry Andric if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
54058bcb0991SDimitry Andric TargetNumRegisters = ForceTargetNumScalarRegs;
54068bcb0991SDimitry Andric } else {
54078bcb0991SDimitry Andric if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
54088bcb0991SDimitry Andric TargetNumRegisters = ForceTargetNumVectorRegs;
54098bcb0991SDimitry Andric }
54108bcb0991SDimitry Andric unsigned MaxLocalUsers = pair.second;
54118bcb0991SDimitry Andric unsigned LoopInvariantRegs = 0;
54128bcb0991SDimitry Andric if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
54138bcb0991SDimitry Andric LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
54148bcb0991SDimitry Andric
5415fe013be4SDimitry Andric unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5416fe013be4SDimitry Andric MaxLocalUsers);
54170b57cec5SDimitry Andric // Don't count the induction variable as interleaved.
54188bcb0991SDimitry Andric if (EnableIndVarRegisterHeur) {
5419fe013be4SDimitry Andric TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
54208bcb0991SDimitry Andric std::max(1U, (MaxLocalUsers - 1)));
54218bcb0991SDimitry Andric }
54228bcb0991SDimitry Andric
54238bcb0991SDimitry Andric IC = std::min(IC, TmpIC);
54248bcb0991SDimitry Andric }
54250b57cec5SDimitry Andric
54260b57cec5SDimitry Andric // Clamp the interleave ranges to reasonable counts.
5427fe013be4SDimitry Andric unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
54280b57cec5SDimitry Andric
54290b57cec5SDimitry Andric // Check if the user has overridden the max.
5430e8d8bef9SDimitry Andric if (VF.isScalar()) {
54310b57cec5SDimitry Andric if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
54320b57cec5SDimitry Andric MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
54330b57cec5SDimitry Andric } else {
54340b57cec5SDimitry Andric if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
54350b57cec5SDimitry Andric MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
54360b57cec5SDimitry Andric }
54370b57cec5SDimitry Andric
5438cdc20ff6SDimitry Andric unsigned EstimatedVF = VF.getKnownMinValue();
5439cdc20ff6SDimitry Andric if (VF.isScalable()) {
5440cdc20ff6SDimitry Andric if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5441cdc20ff6SDimitry Andric EstimatedVF *= *VScale;
5442cdc20ff6SDimitry Andric }
5443cdc20ff6SDimitry Andric assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5444cdc20ff6SDimitry Andric
5445cdc20ff6SDimitry Andric unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5446cdc20ff6SDimitry Andric if (KnownTC) {
5447cdc20ff6SDimitry Andric // If trip count is known we select between two prospective ICs, where
5448cdc20ff6SDimitry Andric // 1) the aggressive IC is capped by the trip count divided by VF
5449cdc20ff6SDimitry Andric // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5450cdc20ff6SDimitry Andric // The final IC is selected in a way that the epilogue loop trip count is
5451cdc20ff6SDimitry Andric // minimized while maximizing the IC itself, so that we either run the
5452cdc20ff6SDimitry Andric // vector loop at least once if it generates a small epilogue loop, or else
5453cdc20ff6SDimitry Andric // we run the vector loop at least twice.
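// Rough worked example (hypothetical numbers): KnownTC = 24, EstimatedVF = 4
// and MaxInterleaveCount = 8 give InterleaveCountUB = bit_floor(min(24 / 4, 8)) = 4
// and InterleaveCountLB = bit_floor(min(24 / 8, 8)) = 2; the scalar tails are
// 24 % (4 * 4) = 8 and 24 % (4 * 2) = 0, which differ, so the conservative
// lower bound of 2 is kept.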
5454cdc20ff6SDimitry Andric
5455cdc20ff6SDimitry Andric unsigned InterleaveCountUB = bit_floor(
5456cdc20ff6SDimitry Andric std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5457cdc20ff6SDimitry Andric unsigned InterleaveCountLB = bit_floor(std::max(
5458cdc20ff6SDimitry Andric 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5459cdc20ff6SDimitry Andric MaxInterleaveCount = InterleaveCountLB;
5460cdc20ff6SDimitry Andric
5461cdc20ff6SDimitry Andric if (InterleaveCountUB != InterleaveCountLB) {
5462cdc20ff6SDimitry Andric unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5463cdc20ff6SDimitry Andric unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5464cdc20ff6SDimitry Andric // If both produce same scalar tail, maximize the IC to do the same work
5465cdc20ff6SDimitry Andric // in fewer vector loop iterations
5466cdc20ff6SDimitry Andric if (TailTripCountUB == TailTripCountLB)
5467cdc20ff6SDimitry Andric MaxInterleaveCount = InterleaveCountUB;
5468cdc20ff6SDimitry Andric }
5469cdc20ff6SDimitry Andric } else if (BestKnownTC) {
5470cdc20ff6SDimitry Andric // If trip count is an estimated compile time constant, limit the
5471cdc20ff6SDimitry Andric // IC to be capped by the trip count divided by VF * 2, such that the vector
5472cdc20ff6SDimitry Andric // loop runs at least twice to make interleaving seem profitable when there
5473cdc20ff6SDimitry Andric // is an epilogue loop present. Since the exact trip count is not known, we
5474cdc20ff6SDimitry Andric // choose to be conservative in our IC estimate.
5475cdc20ff6SDimitry Andric MaxInterleaveCount = bit_floor(std::max(
5476cdc20ff6SDimitry Andric 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
54778bcb0991SDimitry Andric }
54788bcb0991SDimitry Andric
5479e8d8bef9SDimitry Andric assert(MaxInterleaveCount > 0 &&
5480e8d8bef9SDimitry Andric "Maximum interleave count must be greater than 0");
54810b57cec5SDimitry Andric
54820b57cec5SDimitry Andric // Clamp the calculated IC to be between 1 and the max interleave count
54838bcb0991SDimitry Andric // that the target and trip count allow.
54840b57cec5SDimitry Andric if (IC > MaxInterleaveCount)
54850b57cec5SDimitry Andric IC = MaxInterleaveCount;
5486e8d8bef9SDimitry Andric else
5487e8d8bef9SDimitry Andric // Make sure IC is greater than 0.
5488e8d8bef9SDimitry Andric IC = std::max(1u, IC);
5489e8d8bef9SDimitry Andric
5490e8d8bef9SDimitry Andric assert(IC > 0 && "Interleave count must be greater than 0.");
5491e8d8bef9SDimitry Andric
54920b57cec5SDimitry Andric // Interleave if we vectorized this loop and there is a reduction that could
54930b57cec5SDimitry Andric // benefit from interleaving.
5494e8d8bef9SDimitry Andric if (VF.isVector() && HasReductions) {
54950b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
54960b57cec5SDimitry Andric return IC;
54970b57cec5SDimitry Andric }
54980b57cec5SDimitry Andric
549981ad6265SDimitry Andric // For any scalar loop that either requires runtime checks or predication we
550081ad6265SDimitry Andric // are better off leaving this to the unroller. Note that if we've already
550181ad6265SDimitry Andric // vectorized the loop we will have done the runtime check and so interleaving
550281ad6265SDimitry Andric // won't require further checks.
550381ad6265SDimitry Andric bool ScalarInterleavingRequiresPredication =
550481ad6265SDimitry Andric (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
550581ad6265SDimitry Andric return Legal->blockNeedsPredication(BB);
550681ad6265SDimitry Andric }));
550781ad6265SDimitry Andric bool ScalarInterleavingRequiresRuntimePointerCheck =
5508e8d8bef9SDimitry Andric (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
55090b57cec5SDimitry Andric
55100b57cec5SDimitry Andric // We want to interleave small loops in order to reduce the loop overhead and
55110b57cec5SDimitry Andric // potentially expose ILP opportunities.
5512e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5513e8d8bef9SDimitry Andric << "LV: IC is " << IC << '\n'
5514e8d8bef9SDimitry Andric << "LV: VF is " << VF << '\n');
5515e8d8bef9SDimitry Andric const bool AggressivelyInterleaveReductions =
5516e8d8bef9SDimitry Andric TTI.enableAggressiveInterleaving(HasReductions);
551781ad6265SDimitry Andric if (!ScalarInterleavingRequiresRuntimePointerCheck &&
551881ad6265SDimitry Andric !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
55190b57cec5SDimitry Andric // We assume that the cost overhead is 1 and we use the cost model
55200b57cec5SDimitry Andric // to estimate the cost of the loop and interleave until the cost of the
55210b57cec5SDimitry Andric // loop overhead is about 5% of the cost of the loop.
5522fe013be4SDimitry Andric unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5523fe013be4SDimitry Andric SmallLoopCost / *LoopCost.getValue()));
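    // Rough illustration (assuming a SmallLoopCost threshold of 20): a loop
    // body costing 5 gives bit_floor(20 / 5) = 4, so interleaving here is
    // capped at 4 copies regardless of the register-based IC computed above.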
55240b57cec5SDimitry Andric
55250b57cec5SDimitry Andric // Interleave until store/load ports (estimated by max interleave count) are
55260b57cec5SDimitry Andric // saturated.
55270b57cec5SDimitry Andric unsigned NumStores = Legal->getNumStores();
55280b57cec5SDimitry Andric unsigned NumLoads = Legal->getNumLoads();
55290b57cec5SDimitry Andric unsigned StoresIC = IC / (NumStores ? NumStores : 1);
55300b57cec5SDimitry Andric unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
55310b57cec5SDimitry Andric
5532349cc55cSDimitry Andric // There is little point in interleaving for reductions containing selects
5533349cc55cSDimitry Andric // and compares when VF=1 since it may just create more overhead than it's
5534349cc55cSDimitry Andric // worth for loops with small trip counts. This is because we still have to
5535349cc55cSDimitry Andric // do the final reduction after the loop.
5536349cc55cSDimitry Andric bool HasSelectCmpReductions =
5537349cc55cSDimitry Andric HasReductions &&
5538349cc55cSDimitry Andric any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5539349cc55cSDimitry Andric const RecurrenceDescriptor &RdxDesc = Reduction.second;
5540c9157d92SDimitry Andric return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5541349cc55cSDimitry Andric RdxDesc.getRecurrenceKind());
5542349cc55cSDimitry Andric });
5543349cc55cSDimitry Andric if (HasSelectCmpReductions) {
5544349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5545349cc55cSDimitry Andric return 1;
5546349cc55cSDimitry Andric }
5547349cc55cSDimitry Andric
55480b57cec5SDimitry Andric // If we have a scalar reduction (vector reductions are already dealt with
55490b57cec5SDimitry Andric // by this point), we can increase the critical path length if the loop
5550fe6060f1SDimitry Andric // we're interleaving is inside another loop. For tree-wise reductions
5551fe6060f1SDimitry Andric // set the limit to 2, and for ordered reductions it's best to disable
5552fe6060f1SDimitry Andric // interleaving entirely.
5553e8d8bef9SDimitry Andric if (HasReductions && TheLoop->getLoopDepth() > 1) {
5554fe6060f1SDimitry Andric bool HasOrderedReductions =
5555fe6060f1SDimitry Andric any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5556fe6060f1SDimitry Andric const RecurrenceDescriptor &RdxDesc = Reduction.second;
5557fe6060f1SDimitry Andric return RdxDesc.isOrdered();
5558fe6060f1SDimitry Andric });
5559fe6060f1SDimitry Andric if (HasOrderedReductions) {
5560fe6060f1SDimitry Andric LLVM_DEBUG(
5561fe6060f1SDimitry Andric dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5562fe6060f1SDimitry Andric return 1;
5563fe6060f1SDimitry Andric }
5564fe6060f1SDimitry Andric
55650b57cec5SDimitry Andric unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
55660b57cec5SDimitry Andric SmallIC = std::min(SmallIC, F);
55670b57cec5SDimitry Andric StoresIC = std::min(StoresIC, F);
55680b57cec5SDimitry Andric LoadsIC = std::min(LoadsIC, F);
55690b57cec5SDimitry Andric }
55700b57cec5SDimitry Andric
55710b57cec5SDimitry Andric if (EnableLoadStoreRuntimeInterleave &&
55720b57cec5SDimitry Andric std::max(StoresIC, LoadsIC) > SmallIC) {
55730b57cec5SDimitry Andric LLVM_DEBUG(
55740b57cec5SDimitry Andric dbgs() << "LV: Interleaving to saturate store or load ports.\n");
55750b57cec5SDimitry Andric return std::max(StoresIC, LoadsIC);
55760b57cec5SDimitry Andric }
55770b57cec5SDimitry Andric
5578e8d8bef9SDimitry Andric // If there are scalar reductions and TTI has enabled aggressive
5579e8d8bef9SDimitry Andric // interleaving for reductions, we will interleave to expose ILP.
5580e8d8bef9SDimitry Andric if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5581e8d8bef9SDimitry Andric AggressivelyInterleaveReductions) {
5582e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5583e8d8bef9SDimitry Andric // Interleave no less than SmallIC but not as aggressively as the normal IC
5584e8d8bef9SDimitry Andric // to satisfy the rare situation when resources are too limited.
5585e8d8bef9SDimitry Andric return std::max(IC / 2, SmallIC);
5586e8d8bef9SDimitry Andric } else {
55870b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
55880b57cec5SDimitry Andric return SmallIC;
55890b57cec5SDimitry Andric }
5590e8d8bef9SDimitry Andric }
55910b57cec5SDimitry Andric
55920b57cec5SDimitry Andric // Interleave if this is a large loop (small loops are already dealt with by
55930b57cec5SDimitry Andric // this point) that could benefit from interleaving.
5594e8d8bef9SDimitry Andric if (AggressivelyInterleaveReductions) {
55950b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
55960b57cec5SDimitry Andric return IC;
55970b57cec5SDimitry Andric }
55980b57cec5SDimitry Andric
55990b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
56000b57cec5SDimitry Andric return 1;
56010b57cec5SDimitry Andric }
56020b57cec5SDimitry Andric
56030b57cec5SDimitry Andric SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5604e8d8bef9SDimitry Andric LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
56050b57cec5SDimitry Andric // This function calculates the register usage by measuring the highest number
56060b57cec5SDimitry Andric // of values that are alive at a single location. Obviously, this is a very
56070b57cec5SDimitry Andric // rough estimation. We scan the loop in topological order and
56080b57cec5SDimitry Andric // assign a number to each instruction. We use RPO to ensure that defs are
56090b57cec5SDimitry Andric // met before their users. We assume that each instruction that has in-loop
56100b57cec5SDimitry Andric // users starts an interval. We record every time that an in-loop value is
56110b57cec5SDimitry Andric // used, so we have a list of the first and last occurrences of each
56120b57cec5SDimitry Andric // instruction. Next, we transpose this data structure into a multi map that
56130b57cec5SDimitry Andric // holds the list of intervals that *end* at a specific location. This multi
56140b57cec5SDimitry Andric // map allows us to perform a linear search. We scan the instructions linearly
56150b57cec5SDimitry Andric // and record each time that a new interval starts, by placing it in a set.
56160b57cec5SDimitry Andric // If we find this value in the multi-map then we remove it from the set.
56170b57cec5SDimitry Andric // The max register usage is the maximum size of the set.
56180b57cec5SDimitry Andric // We also search for instructions that are defined outside the loop, but are
56190b57cec5SDimitry Andric // used inside the loop. We need this number separately from the max-interval
56200b57cec5SDimitry Andric // usage number because when we unroll, loop-invariant values do not take
56210b57cec5SDimitry Andric // more registers.
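// Small illustrative sketch (hypothetical IR): for a body of the form
//   %a = load %p ; %b = add %a, 1 ; %c = mul %a, %b ; store %c, %q
// %a and %b are both open at the mul, so the peak in-loop usage is two values,
// while %p and %q (defined outside the loop) are counted separately as loop
// invariants.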
56220b57cec5SDimitry Andric LoopBlocksDFS DFS(TheLoop);
56230b57cec5SDimitry Andric DFS.perform(LI);
56240b57cec5SDimitry Andric
56250b57cec5SDimitry Andric RegisterUsage RU;
56260b57cec5SDimitry Andric
56270b57cec5SDimitry Andric // Each 'key' in the map opens a new interval. The values
56280b57cec5SDimitry Andric // of the map are the index of the 'last seen' usage of the
56290b57cec5SDimitry Andric // instruction that is the key.
56300b57cec5SDimitry Andric using IntervalMap = DenseMap<Instruction *, unsigned>;
56310b57cec5SDimitry Andric
56320b57cec5SDimitry Andric // Maps instruction to its index.
56330b57cec5SDimitry Andric SmallVector<Instruction *, 64> IdxToInstr;
56340b57cec5SDimitry Andric // Marks the end of each interval.
56350b57cec5SDimitry Andric IntervalMap EndPoint;
56360b57cec5SDimitry Andric // Saves the list of instruction indices that are used in the loop.
56370b57cec5SDimitry Andric SmallPtrSet<Instruction *, 8> Ends;
5638bdd1243dSDimitry Andric // Saves the list of values that are used in the loop but are defined outside
5639bdd1243dSDimitry Andric // the loop (not including non-instruction values such as arguments and
5640bdd1243dSDimitry Andric // constants).
5641fe013be4SDimitry Andric SmallSetVector<Instruction *, 8> LoopInvariants;
56420b57cec5SDimitry Andric
56430b57cec5SDimitry Andric for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
56440b57cec5SDimitry Andric for (Instruction &I : BB->instructionsWithoutDebug()) {
56450b57cec5SDimitry Andric IdxToInstr.push_back(&I);
56460b57cec5SDimitry Andric
56470b57cec5SDimitry Andric // Save the end location of each USE.
56480b57cec5SDimitry Andric for (Value *U : I.operands()) {
56490b57cec5SDimitry Andric auto *Instr = dyn_cast<Instruction>(U);
56500b57cec5SDimitry Andric
56510b57cec5SDimitry Andric // Ignore non-instruction values such as arguments, constants, etc.
5652bdd1243dSDimitry Andric // FIXME: Might need some motivation why these values are ignored. If,
5653bdd1243dSDimitry Andric // for example, an argument is used inside the loop, it will increase the
5654bdd1243dSDimitry Andric // register pressure (so shouldn't we add it to LoopInvariants?).
56550b57cec5SDimitry Andric if (!Instr)
56560b57cec5SDimitry Andric continue;
56570b57cec5SDimitry Andric
56580b57cec5SDimitry Andric // If this instruction is outside the loop then record it and continue.
56590b57cec5SDimitry Andric if (!TheLoop->contains(Instr)) {
56600b57cec5SDimitry Andric LoopInvariants.insert(Instr);
56610b57cec5SDimitry Andric continue;
56620b57cec5SDimitry Andric }
56630b57cec5SDimitry Andric
56640b57cec5SDimitry Andric // Overwrite previous end points.
56650b57cec5SDimitry Andric EndPoint[Instr] = IdxToInstr.size();
56660b57cec5SDimitry Andric Ends.insert(Instr);
56670b57cec5SDimitry Andric }
56680b57cec5SDimitry Andric }
56690b57cec5SDimitry Andric }
56700b57cec5SDimitry Andric
56710b57cec5SDimitry Andric // Saves the list of intervals that end with the index in 'key'.
56720b57cec5SDimitry Andric using InstrList = SmallVector<Instruction *, 2>;
56730b57cec5SDimitry Andric DenseMap<unsigned, InstrList> TransposeEnds;
56740b57cec5SDimitry Andric
56750b57cec5SDimitry Andric // Transpose the EndPoints to a list of values that end at each index.
56760b57cec5SDimitry Andric for (auto &Interval : EndPoint)
56770b57cec5SDimitry Andric TransposeEnds[Interval.second].push_back(Interval.first);
56780b57cec5SDimitry Andric
56790b57cec5SDimitry Andric SmallPtrSet<Instruction *, 8> OpenIntervals;
56800b57cec5SDimitry Andric SmallVector<RegisterUsage, 8> RUs(VFs.size());
56818bcb0991SDimitry Andric SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
56820b57cec5SDimitry Andric
56830b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
56840b57cec5SDimitry Andric
5685753f127fSDimitry Andric const auto &TTICapture = TTI;
5686753f127fSDimitry Andric auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5687e8d8bef9SDimitry Andric if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5688fe6060f1SDimitry Andric return 0;
5689753f127fSDimitry Andric return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
56900b57cec5SDimitry Andric };
56910b57cec5SDimitry Andric
56920b57cec5SDimitry Andric for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
56930b57cec5SDimitry Andric Instruction *I = IdxToInstr[i];
56940b57cec5SDimitry Andric
56950b57cec5SDimitry Andric // Remove all of the instructions that end at this location.
56960b57cec5SDimitry Andric InstrList &List = TransposeEnds[i];
56970b57cec5SDimitry Andric for (Instruction *ToRemove : List)
56980b57cec5SDimitry Andric OpenIntervals.erase(ToRemove);
56990b57cec5SDimitry Andric
57000b57cec5SDimitry Andric // Ignore instructions that are never used within the loop.
57015ffd83dbSDimitry Andric if (!Ends.count(I))
57020b57cec5SDimitry Andric continue;
57030b57cec5SDimitry Andric
57040b57cec5SDimitry Andric // Skip ignored values.
57055ffd83dbSDimitry Andric if (ValuesToIgnore.count(I))
57060b57cec5SDimitry Andric continue;
57070b57cec5SDimitry Andric
5708c9157d92SDimitry Andric collectInLoopReductions();
5709c9157d92SDimitry Andric
57100b57cec5SDimitry Andric // For each VF find the maximum usage of registers.
57110b57cec5SDimitry Andric for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5712bdd1243dSDimitry Andric // Count the number of registers used, per register class, given all open
5713bdd1243dSDimitry Andric // intervals.
5714bdd1243dSDimitry Andric // Note that elements in this SmallMapVector will be default constructed
5715bdd1243dSDimitry Andric // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5716bdd1243dSDimitry Andric // there is no previous entry for ClassID.
57178bcb0991SDimitry Andric SmallMapVector<unsigned, unsigned, 4> RegUsage;
57188bcb0991SDimitry Andric
5719e8d8bef9SDimitry Andric if (VFs[j].isScalar()) {
5720bdd1243dSDimitry Andric for (auto *Inst : OpenIntervals) {
5721bdd1243dSDimitry Andric unsigned ClassID =
5722bdd1243dSDimitry Andric TTI.getRegisterClassForType(false, Inst->getType());
5723bdd1243dSDimitry Andric // FIXME: The target might use more than one register for the type
5724bdd1243dSDimitry Andric // even in the scalar case.
57258bcb0991SDimitry Andric RegUsage[ClassID] += 1;
57268bcb0991SDimitry Andric }
57278bcb0991SDimitry Andric } else {
57288bcb0991SDimitry Andric collectUniformsAndScalars(VFs[j]);
5729bdd1243dSDimitry Andric for (auto *Inst : OpenIntervals) {
57300b57cec5SDimitry Andric // Skip ignored values for VF > 1.
57315ffd83dbSDimitry Andric if (VecValuesToIgnore.count(Inst))
57320b57cec5SDimitry Andric continue;
57338bcb0991SDimitry Andric if (isScalarAfterVectorization(Inst, VFs[j])) {
5734bdd1243dSDimitry Andric unsigned ClassID =
5735bdd1243dSDimitry Andric TTI.getRegisterClassForType(false, Inst->getType());
5736bdd1243dSDimitry Andric // FIXME: The target might use more than one register for the type
5737bdd1243dSDimitry Andric // even in the scalar case.
57388bcb0991SDimitry Andric RegUsage[ClassID] += 1;
57398bcb0991SDimitry Andric } else {
5740bdd1243dSDimitry Andric unsigned ClassID =
5741bdd1243dSDimitry Andric TTI.getRegisterClassForType(true, Inst->getType());
57428bcb0991SDimitry Andric RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
57430b57cec5SDimitry Andric }
57448bcb0991SDimitry Andric }
57458bcb0991SDimitry Andric }
57468bcb0991SDimitry Andric
57478bcb0991SDimitry Andric for (auto& pair : RegUsage) {
5748bdd1243dSDimitry Andric auto &Entry = MaxUsages[j][pair.first];
5749bdd1243dSDimitry Andric Entry = std::max(Entry, pair.second);
57508bcb0991SDimitry Andric }
57510b57cec5SDimitry Andric }
57520b57cec5SDimitry Andric
57530b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
57540b57cec5SDimitry Andric << OpenIntervals.size() << '\n');
57550b57cec5SDimitry Andric
57560b57cec5SDimitry Andric // Add the current instruction to the list of open intervals.
57570b57cec5SDimitry Andric OpenIntervals.insert(I);
57580b57cec5SDimitry Andric }
57590b57cec5SDimitry Andric
57600b57cec5SDimitry Andric for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5761bdd1243dSDimitry Andric // Note that elements in this SmallMapVector will be default constructed
5762bdd1243dSDimitry Andric // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5763bdd1243dSDimitry Andric // there is no previous entry for ClassID.
57648bcb0991SDimitry Andric SmallMapVector<unsigned, unsigned, 4> Invariant;
57658bcb0991SDimitry Andric
5766bdd1243dSDimitry Andric for (auto *Inst : LoopInvariants) {
5767bdd1243dSDimitry Andric // FIXME: The target might use more than one register for the type
5768bdd1243dSDimitry Andric // even in the scalar case.
57691ac55f4cSDimitry Andric bool IsScalar = all_of(Inst->users(), [&](User *U) {
57701ac55f4cSDimitry Andric auto *I = cast<Instruction>(U);
57711ac55f4cSDimitry Andric return TheLoop != LI->getLoopFor(I->getParent()) ||
57721ac55f4cSDimitry Andric isScalarAfterVectorization(I, VFs[i]);
57731ac55f4cSDimitry Andric });
57741ac55f4cSDimitry Andric
57751ac55f4cSDimitry Andric ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5776e8d8bef9SDimitry Andric unsigned ClassID =
57771ac55f4cSDimitry Andric TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
57781ac55f4cSDimitry Andric Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
57790b57cec5SDimitry Andric }
57800b57cec5SDimitry Andric
57818bcb0991SDimitry Andric LLVM_DEBUG({
57828bcb0991SDimitry Andric dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
57838bcb0991SDimitry Andric dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
57848bcb0991SDimitry Andric << " item\n";
57858bcb0991SDimitry Andric for (const auto &pair : MaxUsages[i]) {
57868bcb0991SDimitry Andric dbgs() << "LV(REG): RegisterClass: "
57878bcb0991SDimitry Andric << TTI.getRegisterClassName(pair.first) << ", " << pair.second
57888bcb0991SDimitry Andric << " registers\n";
57898bcb0991SDimitry Andric }
57908bcb0991SDimitry Andric dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
57918bcb0991SDimitry Andric << " item\n";
57928bcb0991SDimitry Andric for (const auto &pair : Invariant) {
57938bcb0991SDimitry Andric dbgs() << "LV(REG): RegisterClass: "
57948bcb0991SDimitry Andric << TTI.getRegisterClassName(pair.first) << ", " << pair.second
57958bcb0991SDimitry Andric << " registers\n";
57968bcb0991SDimitry Andric }
57978bcb0991SDimitry Andric });
57980b57cec5SDimitry Andric
57990b57cec5SDimitry Andric RU.LoopInvariantRegs = Invariant;
58000b57cec5SDimitry Andric RU.MaxLocalUsers = MaxUsages[i];
58010b57cec5SDimitry Andric RUs[i] = RU;
58020b57cec5SDimitry Andric }
58030b57cec5SDimitry Andric
58040b57cec5SDimitry Andric return RUs;
58050b57cec5SDimitry Andric }
58060b57cec5SDimitry Andric
580704eeddc0SDimitry Andric bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
580804eeddc0SDimitry Andric ElementCount VF) {
58090b57cec5SDimitry Andric // TODO: Cost model for emulated masked load/store is completely
58100b57cec5SDimitry Andric // broken. This hack guides the cost model to use an artificially
58110b57cec5SDimitry Andric // high enough value to practically disable vectorization with such
58120b57cec5SDimitry Andric // operations, except where previously deployed legality hack allowed
58130b57cec5SDimitry Andric // using very low cost values. This is to avoid regressions coming simply
58140b57cec5SDimitry Andric // from moving "masked load/store" check from legality to cost model.
58150b57cec5SDimitry Andric // Masked Load/Gather emulation was previously never allowed.
58160b57cec5SDimitry Andric // A limited amount of Masked Store/Scatter emulation was allowed.
5817bdd1243dSDimitry Andric assert((isPredicatedInst(I)) &&
5818fcaf7f86SDimitry Andric "Expecting a scalar emulated instruction");
58190b57cec5SDimitry Andric return isa<LoadInst>(I) ||
58200b57cec5SDimitry Andric (isa<StoreInst>(I) &&
58210b57cec5SDimitry Andric NumPredStores > NumberOfStoresToPredicate);
58220b57cec5SDimitry Andric }
58230b57cec5SDimitry Andric
5824e8d8bef9SDimitry Andric void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
58250b57cec5SDimitry Andric // If we aren't vectorizing the loop, or if we've already collected the
58260b57cec5SDimitry Andric // instructions to scalarize, there's nothing to do. Collection may already
58270b57cec5SDimitry Andric // have occurred if we have a user-selected VF and are now computing the
58280b57cec5SDimitry Andric // expected cost for interleaving.
5829fe013be4SDimitry Andric if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
58300b57cec5SDimitry Andric return;
58310b57cec5SDimitry Andric
58320b57cec5SDimitry Andric // Initialize a mapping for VF in InstsToScalarize. If we find that it's
58330b57cec5SDimitry Andric // not profitable to scalarize any instructions, the presence of VF in the
58340b57cec5SDimitry Andric // map will indicate that we've analyzed it already.
58350b57cec5SDimitry Andric ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
58360b57cec5SDimitry Andric
5837753f127fSDimitry Andric PredicatedBBsAfterVectorization[VF].clear();
5838753f127fSDimitry Andric
58390b57cec5SDimitry Andric // Find all the instructions that are scalar with predication in the loop and
58400b57cec5SDimitry Andric // determine if it would be better to not if-convert the blocks they are in.
58410b57cec5SDimitry Andric // If so, we also record the instructions to scalarize.
58420b57cec5SDimitry Andric for (BasicBlock *BB : TheLoop->blocks()) {
5843349cc55cSDimitry Andric if (!blockNeedsPredicationForAnyReason(BB))
58440b57cec5SDimitry Andric continue;
58450b57cec5SDimitry Andric for (Instruction &I : *BB)
584604eeddc0SDimitry Andric if (isScalarWithPredication(&I, VF)) {
58470b57cec5SDimitry Andric ScalarCostsTy ScalarCosts;
5848fe6060f1SDimitry Andric // Do not apply discount if scalable, because that would lead to
5849fe6060f1SDimitry Andric // invalid scalarization costs.
58500b57cec5SDimitry Andric // Do not apply discount logic if hacked cost is needed
58510b57cec5SDimitry Andric // for emulated masked memrefs.
585204eeddc0SDimitry Andric if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
58530b57cec5SDimitry Andric computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
58540b57cec5SDimitry Andric ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
58550b57cec5SDimitry Andric // Remember that BB will remain after vectorization.
5856753f127fSDimitry Andric PredicatedBBsAfterVectorization[VF].insert(BB);
58570b57cec5SDimitry Andric }
58580b57cec5SDimitry Andric }
58590b57cec5SDimitry Andric }
58600b57cec5SDimitry Andric
5861bdd1243dSDimitry Andric InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5862e8d8bef9SDimitry Andric Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
58630b57cec5SDimitry Andric assert(!isUniformAfterVectorization(PredInst, VF) &&
58640b57cec5SDimitry Andric "Instruction marked uniform-after-vectorization will be predicated");
58650b57cec5SDimitry Andric
58660b57cec5SDimitry Andric // Initialize the discount to zero, meaning that the scalar version and the
58670b57cec5SDimitry Andric // vector version cost the same.
5868e8d8bef9SDimitry Andric InstructionCost Discount = 0;
58690b57cec5SDimitry Andric
58700b57cec5SDimitry Andric // Holds instructions to analyze. The instructions we visit are mapped in
58710b57cec5SDimitry Andric // ScalarCosts. Those instructions are the ones that would be scalarized if
58720b57cec5SDimitry Andric // we find that the scalar version costs less.
58730b57cec5SDimitry Andric SmallVector<Instruction *, 8> Worklist;
58740b57cec5SDimitry Andric
58750b57cec5SDimitry Andric // Returns true if the given instruction can be scalarized.
58760b57cec5SDimitry Andric auto canBeScalarized = [&](Instruction *I) -> bool {
58770b57cec5SDimitry Andric // We only attempt to scalarize instructions forming a single-use chain
58780b57cec5SDimitry Andric // from the original predicated block that would otherwise be vectorized.
58790b57cec5SDimitry Andric // Although not strictly necessary, we give up on instructions we know will
58800b57cec5SDimitry Andric // already be scalar to avoid traversing chains that are unlikely to be
58810b57cec5SDimitry Andric // beneficial.
58820b57cec5SDimitry Andric if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
58830b57cec5SDimitry Andric isScalarAfterVectorization(I, VF))
58840b57cec5SDimitry Andric return false;
58850b57cec5SDimitry Andric
58860b57cec5SDimitry Andric // If the instruction is scalar with predication, it will be analyzed
58870b57cec5SDimitry Andric // separately. We ignore it within the context of PredInst.
588804eeddc0SDimitry Andric if (isScalarWithPredication(I, VF))
58890b57cec5SDimitry Andric return false;
58900b57cec5SDimitry Andric
58910b57cec5SDimitry Andric // If any of the instruction's operands are uniform after vectorization,
58920b57cec5SDimitry Andric // the instruction cannot be scalarized. This prevents, for example, a
58930b57cec5SDimitry Andric // masked load from being scalarized.
58940b57cec5SDimitry Andric //
58950b57cec5SDimitry Andric // We assume we will only emit a value for lane zero of an instruction
58960b57cec5SDimitry Andric // marked uniform after vectorization, rather than VF identical values.
58970b57cec5SDimitry Andric // Thus, if we scalarize an instruction that uses a uniform, we would
58980b57cec5SDimitry Andric // create uses of values corresponding to the lanes we aren't emitting code
58990b57cec5SDimitry Andric // for. This behavior can be changed by allowing getScalarValue to clone
59000b57cec5SDimitry Andric // the lane zero values for uniforms rather than asserting.
59010b57cec5SDimitry Andric for (Use &U : I->operands())
59020b57cec5SDimitry Andric if (auto *J = dyn_cast<Instruction>(U.get()))
59030b57cec5SDimitry Andric if (isUniformAfterVectorization(J, VF))
59040b57cec5SDimitry Andric return false;
59050b57cec5SDimitry Andric
59060b57cec5SDimitry Andric // Otherwise, we can scalarize the instruction.
59070b57cec5SDimitry Andric return true;
59080b57cec5SDimitry Andric };
59090b57cec5SDimitry Andric
59100b57cec5SDimitry Andric // Compute the expected cost discount from scalarizing the entire expression
59110b57cec5SDimitry Andric // feeding the predicated instruction. We currently only consider expressions
59120b57cec5SDimitry Andric // that are single-use instruction chains.
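// As an illustrative sketch (the IR and names below are hypothetical), given
// a predicated block containing
//   %a = add i32 %x, 1
//   %b = mul i32 %a, %y
//   store i32 %b, ptr %p      ; the predicated instruction
// the single-use chain %a -> %b feeding the store lives in the same block, so
// both instructions are candidates for being scalarized along with the store
// whenever the summed scalar costs beat the summed vector costs.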
59130b57cec5SDimitry Andric Worklist.push_back(PredInst);
59140b57cec5SDimitry Andric while (!Worklist.empty()) {
59150b57cec5SDimitry Andric Instruction *I = Worklist.pop_back_val();
59160b57cec5SDimitry Andric
59170b57cec5SDimitry Andric // If we've already analyzed the instruction, there's nothing to do.
5918fe013be4SDimitry Andric if (ScalarCosts.contains(I))
59190b57cec5SDimitry Andric continue;
59200b57cec5SDimitry Andric
59210b57cec5SDimitry Andric // Compute the cost of the vector instruction. Note that this cost already
59220b57cec5SDimitry Andric // includes the scalarization overhead of the predicated instruction.
5923e8d8bef9SDimitry Andric InstructionCost VectorCost = getInstructionCost(I, VF).first;
59240b57cec5SDimitry Andric
59250b57cec5SDimitry Andric // Compute the cost of the scalarized instruction. This cost is the cost of
59260b57cec5SDimitry Andric // the instruction as if it wasn't if-converted and instead remained in the
59270b57cec5SDimitry Andric // predicated block. We will scale this cost by block probability after
59280b57cec5SDimitry Andric // computing the scalarization overhead.
5929e8d8bef9SDimitry Andric InstructionCost ScalarCost =
5930fe6060f1SDimitry Andric VF.getFixedValue() *
5931e8d8bef9SDimitry Andric getInstructionCost(I, ElementCount::getFixed(1)).first;
59320b57cec5SDimitry Andric
59330b57cec5SDimitry Andric // Compute the scalarization overhead of needed insertelement instructions
59340b57cec5SDimitry Andric // and phi nodes.
5935bdd1243dSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
593604eeddc0SDimitry Andric if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
59375ffd83dbSDimitry Andric ScalarCost += TTI.getScalarizationOverhead(
59385ffd83dbSDimitry Andric cast<VectorType>(ToVectorTy(I->getType(), VF)),
5939bdd1243dSDimitry Andric APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5940bdd1243dSDimitry Andric /*Extract*/ false, CostKind);
5941e8d8bef9SDimitry Andric ScalarCost +=
5942bdd1243dSDimitry Andric VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
59430b57cec5SDimitry Andric }
59440b57cec5SDimitry Andric
59450b57cec5SDimitry Andric // Compute the scalarization overhead of needed extractelement
59460b57cec5SDimitry Andric // instructions. For each of the instruction's operands, if the operand can
59470b57cec5SDimitry Andric // be scalarized, add it to the worklist; otherwise, account for the
59480b57cec5SDimitry Andric // overhead.
59490b57cec5SDimitry Andric for (Use &U : I->operands())
59500b57cec5SDimitry Andric if (auto *J = dyn_cast<Instruction>(U.get())) {
59510b57cec5SDimitry Andric assert(VectorType::isValidElementType(J->getType()) &&
59520b57cec5SDimitry Andric "Instruction has non-scalar type");
59530b57cec5SDimitry Andric if (canBeScalarized(J))
59540b57cec5SDimitry Andric Worklist.push_back(J);
5955e8d8bef9SDimitry Andric else if (needsExtract(J, VF)) {
59560b57cec5SDimitry Andric ScalarCost += TTI.getScalarizationOverhead(
59575ffd83dbSDimitry Andric cast<VectorType>(ToVectorTy(J->getType(), VF)),
5958bdd1243dSDimitry Andric APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5959bdd1243dSDimitry Andric /*Extract*/ true, CostKind);
5960e8d8bef9SDimitry Andric }
59610b57cec5SDimitry Andric }
59620b57cec5SDimitry Andric
59630b57cec5SDimitry Andric // Scale the total scalar cost by block probability.
59640b57cec5SDimitry Andric ScalarCost /= getReciprocalPredBlockProb();
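// For example, with the default reciprocal block probability of 2 (the
// predicated block is assumed to execute on roughly half the iterations), an
// accumulated scalar cost of 8 is charged as 4 before it is compared against
// the vector cost. (Illustrative numbers only.)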
59650b57cec5SDimitry Andric
59660b57cec5SDimitry Andric // Compute the discount. A non-negative discount means the vector version
59670b57cec5SDimitry Andric // of the instruction costs more, and scalarizing would be beneficial.
59680b57cec5SDimitry Andric Discount += VectorCost - ScalarCost;
59690b57cec5SDimitry Andric ScalarCosts[I] = ScalarCost;
59700b57cec5SDimitry Andric }
59710b57cec5SDimitry Andric
5972bdd1243dSDimitry Andric return Discount;
59730b57cec5SDimitry Andric }
59740b57cec5SDimitry Andric
59750b57cec5SDimitry Andric LoopVectorizationCostModel::VectorizationCostTy
5976fe6060f1SDimitry Andric LoopVectorizationCostModel::expectedCost(
5977fe6060f1SDimitry Andric ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
59780b57cec5SDimitry Andric VectorizationCostTy Cost;
59790b57cec5SDimitry Andric
59800b57cec5SDimitry Andric // For each block.
59810b57cec5SDimitry Andric for (BasicBlock *BB : TheLoop->blocks()) {
59820b57cec5SDimitry Andric VectorizationCostTy BlockCost;
59830b57cec5SDimitry Andric
59840b57cec5SDimitry Andric // For each instruction in the old loop.
59850b57cec5SDimitry Andric for (Instruction &I : BB->instructionsWithoutDebug()) {
59860b57cec5SDimitry Andric // Skip ignored values.
5987e8d8bef9SDimitry Andric if (ValuesToIgnore.count(&I) ||
5988e8d8bef9SDimitry Andric (VF.isVector() && VecValuesToIgnore.count(&I)))
59890b57cec5SDimitry Andric continue;
59900b57cec5SDimitry Andric
59910b57cec5SDimitry Andric VectorizationCostTy C = getInstructionCost(&I, VF);
59920b57cec5SDimitry Andric
59930b57cec5SDimitry Andric // Check if we should override the cost.
5994fe6060f1SDimitry Andric if (C.first.isValid() &&
5995fe6060f1SDimitry Andric ForceTargetInstructionCost.getNumOccurrences() > 0)
5996e8d8bef9SDimitry Andric C.first = InstructionCost(ForceTargetInstructionCost);
59970b57cec5SDimitry Andric
5998fe6060f1SDimitry Andric // Keep a list of instructions with invalid costs.
5999fe6060f1SDimitry Andric if (Invalid && !C.first.isValid())
6000fe6060f1SDimitry Andric Invalid->emplace_back(&I, VF);
6001fe6060f1SDimitry Andric
60020b57cec5SDimitry Andric BlockCost.first += C.first;
60030b57cec5SDimitry Andric BlockCost.second |= C.second;
60040b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
60050b57cec5SDimitry Andric << " for VF " << VF << " For instruction: " << I
60060b57cec5SDimitry Andric << '\n');
60070b57cec5SDimitry Andric }
60080b57cec5SDimitry Andric
60090b57cec5SDimitry Andric // If we are vectorizing a predicated block, it will have been
60100b57cec5SDimitry Andric // if-converted. This means that the block's instructions (aside from
60110b57cec5SDimitry Andric // stores and instructions that may divide by zero) will now be
60120b57cec5SDimitry Andric // unconditionally executed. For the scalar case, we may not always execute
6013e8d8bef9SDimitry Andric // the predicated block, if it is an if-else block. Thus, scale the block's
6014e8d8bef9SDimitry Andric // cost by the probability of executing it. blockNeedsPredication from
6015e8d8bef9SDimitry Andric // Legal is used so as not to include all blocks in tail-folded loops.
6016e8d8bef9SDimitry Andric if (VF.isScalar() && Legal->blockNeedsPredication(BB))
60170b57cec5SDimitry Andric BlockCost.first /= getReciprocalPredBlockProb();
60180b57cec5SDimitry Andric
60190b57cec5SDimitry Andric Cost.first += BlockCost.first;
60200b57cec5SDimitry Andric Cost.second |= BlockCost.second;
60210b57cec5SDimitry Andric }
60220b57cec5SDimitry Andric
60230b57cec5SDimitry Andric return Cost;
60240b57cec5SDimitry Andric }
60250b57cec5SDimitry Andric
60260b57cec5SDimitry Andric /// Gets Address Access SCEV after verifying that the access pattern
60270b57cec5SDimitry Andric /// is loop invariant except the induction variable dependence.
60280b57cec5SDimitry Andric ///
60290b57cec5SDimitry Andric /// This SCEV can be sent to the Target in order to estimate the address
60300b57cec5SDimitry Andric /// calculation cost.
60310b57cec5SDimitry Andric static const SCEV *getAddressAccessSCEV(
60320b57cec5SDimitry Andric Value *Ptr,
60330b57cec5SDimitry Andric LoopVectorizationLegality *Legal,
60340b57cec5SDimitry Andric PredicatedScalarEvolution &PSE,
60350b57cec5SDimitry Andric const Loop *TheLoop) {
60360b57cec5SDimitry Andric
60370b57cec5SDimitry Andric auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
60380b57cec5SDimitry Andric if (!Gep)
60390b57cec5SDimitry Andric return nullptr;
60400b57cec5SDimitry Andric
60410b57cec5SDimitry Andric // We are looking for a GEP with all loop-invariant indices except for one,
60420b57cec5SDimitry Andric // which should be an induction variable.
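// As a hypothetical example, a pointer such as
//   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
// qualifies because its only index %iv is an induction variable; an index
// that is neither loop invariant nor an induction variable makes us bail out
// and return null below.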
60430b57cec5SDimitry Andric auto SE = PSE.getSE();
60440b57cec5SDimitry Andric unsigned NumOperands = Gep->getNumOperands();
60450b57cec5SDimitry Andric for (unsigned i = 1; i < NumOperands; ++i) {
60460b57cec5SDimitry Andric Value *Opd = Gep->getOperand(i);
60470b57cec5SDimitry Andric if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
60480b57cec5SDimitry Andric !Legal->isInductionVariable(Opd))
60490b57cec5SDimitry Andric return nullptr;
60500b57cec5SDimitry Andric }
60510b57cec5SDimitry Andric
60520b57cec5SDimitry Andric // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the Ptr SCEV.
60530b57cec5SDimitry Andric return PSE.getSCEV(Ptr);
60540b57cec5SDimitry Andric }
60550b57cec5SDimitry Andric
6056e8d8bef9SDimitry Andric InstructionCost
6057e8d8bef9SDimitry Andric LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6058e8d8bef9SDimitry Andric ElementCount VF) {
6059e8d8bef9SDimitry Andric assert(VF.isVector() &&
6060e8d8bef9SDimitry Andric "Scalarization cost of instruction implies vectorization.");
6061fe6060f1SDimitry Andric if (VF.isScalable())
6062fe6060f1SDimitry Andric return InstructionCost::getInvalid();
6063fe6060f1SDimitry Andric
6064fe6060f1SDimitry Andric Type *ValTy = getLoadStoreType(I);
60650b57cec5SDimitry Andric auto SE = PSE.getSE();
60660b57cec5SDimitry Andric
60670b57cec5SDimitry Andric unsigned AS = getLoadStoreAddressSpace(I);
60680b57cec5SDimitry Andric Value *Ptr = getLoadStorePointerOperand(I);
60690b57cec5SDimitry Andric Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
60704824e7fdSDimitry Andric // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
60714824e7fdSDimitry Andric // that it is being called from this specific place.
60720b57cec5SDimitry Andric
60730b57cec5SDimitry Andric // Figure out whether the access is strided and get the stride value
60740b57cec5SDimitry Andric // if it's known at compile time.
60750b57cec5SDimitry Andric const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
60760b57cec5SDimitry Andric
60770b57cec5SDimitry Andric // Get the cost of the scalar memory instruction and address computation.
6078e8d8bef9SDimitry Andric InstructionCost Cost =
6079e8d8bef9SDimitry Andric VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
60800b57cec5SDimitry Andric
60810b57cec5SDimitry Andric // Don't pass *I here, since it is scalar but will actually be part of a
60820b57cec5SDimitry Andric // vectorized loop where the user of it is a vectorized instruction.
6083bdd1243dSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
60845ffd83dbSDimitry Andric const Align Alignment = getLoadStoreAlignment(I);
6085bdd1243dSDimitry Andric Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6086bdd1243dSDimitry Andric ValTy->getScalarType(),
6087bdd1243dSDimitry Andric Alignment, AS, CostKind);
60880b57cec5SDimitry Andric
60890b57cec5SDimitry Andric // Get the overhead of the extractelement and insertelement instructions
60900b57cec5SDimitry Andric // we might create due to scalarization.
6091bdd1243dSDimitry Andric Cost += getScalarizationOverhead(I, VF, CostKind);
60920b57cec5SDimitry Andric
6093fe6060f1SDimitry Andric // If we have a predicated load/store, it will need extra i1 extracts and
6094fe6060f1SDimitry Andric // conditional branches, but may not be executed for each vector lane. Scale
6095fe6060f1SDimitry Andric // the cost by the probability of executing the predicated block.
6096bdd1243dSDimitry Andric if (isPredicatedInst(I)) {
60970b57cec5SDimitry Andric Cost /= getReciprocalPredBlockProb();
60980b57cec5SDimitry Andric
6099fe6060f1SDimitry Andric // Add the cost of an i1 extract and a branch
6100fe6060f1SDimitry Andric auto *Vec_i1Ty =
6101fe6060f1SDimitry Andric VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6102fe6060f1SDimitry Andric Cost += TTI.getScalarizationOverhead(
6103349cc55cSDimitry Andric Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6104bdd1243dSDimitry Andric /*Insert=*/false, /*Extract=*/true, CostKind);
6105bdd1243dSDimitry Andric Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6106fe6060f1SDimitry Andric
610704eeddc0SDimitry Andric if (useEmulatedMaskMemRefHack(I, VF))
61080b57cec5SDimitry Andric // Artificially setting to a high enough value to practically disable
61090b57cec5SDimitry Andric // vectorization with such operations.
61100b57cec5SDimitry Andric Cost = 3000000;
61110b57cec5SDimitry Andric }
61120b57cec5SDimitry Andric
61130b57cec5SDimitry Andric return Cost;
61140b57cec5SDimitry Andric }
61150b57cec5SDimitry Andric
6116e8d8bef9SDimitry Andric InstructionCost
6117e8d8bef9SDimitry Andric LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6118e8d8bef9SDimitry Andric ElementCount VF) {
6119fe6060f1SDimitry Andric Type *ValTy = getLoadStoreType(I);
61205ffd83dbSDimitry Andric auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
61210b57cec5SDimitry Andric Value *Ptr = getLoadStorePointerOperand(I);
61220b57cec5SDimitry Andric unsigned AS = getLoadStoreAddressSpace(I);
6123349cc55cSDimitry Andric int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
61245ffd83dbSDimitry Andric enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
61250b57cec5SDimitry Andric
61260b57cec5SDimitry Andric assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
61270b57cec5SDimitry Andric "Stride should be 1 or -1 for consecutive memory access");
61285ffd83dbSDimitry Andric const Align Alignment = getLoadStoreAlignment(I);
6129e8d8bef9SDimitry Andric InstructionCost Cost = 0;
6130bdd1243dSDimitry Andric if (Legal->isMaskRequired(I)) {
61315ffd83dbSDimitry Andric Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
61325ffd83dbSDimitry Andric CostKind);
6133bdd1243dSDimitry Andric } else {
6134bdd1243dSDimitry Andric TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
61355ffd83dbSDimitry Andric Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6136bdd1243dSDimitry Andric CostKind, OpInfo, I);
6137bdd1243dSDimitry Andric }
61380b57cec5SDimitry Andric
61390b57cec5SDimitry Andric bool Reverse = ConsecutiveStride < 0;
61400b57cec5SDimitry Andric if (Reverse)
6141bdd1243dSDimitry Andric Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6142bdd1243dSDimitry Andric std::nullopt, CostKind, 0);
61430b57cec5SDimitry Andric return Cost;
61440b57cec5SDimitry Andric }
61450b57cec5SDimitry Andric
6146e8d8bef9SDimitry Andric InstructionCost
6147e8d8bef9SDimitry Andric LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6148e8d8bef9SDimitry Andric ElementCount VF) {
6149fe013be4SDimitry Andric assert(Legal->isUniformMemOp(*I, VF));
6150e8d8bef9SDimitry Andric
6151fe6060f1SDimitry Andric Type *ValTy = getLoadStoreType(I);
61525ffd83dbSDimitry Andric auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
61535ffd83dbSDimitry Andric const Align Alignment = getLoadStoreAlignment(I);
61540b57cec5SDimitry Andric unsigned AS = getLoadStoreAddressSpace(I);
61555ffd83dbSDimitry Andric enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
61560b57cec5SDimitry Andric if (isa<LoadInst>(I)) {
61570b57cec5SDimitry Andric return TTI.getAddressComputationCost(ValTy) +
61585ffd83dbSDimitry Andric TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
61595ffd83dbSDimitry Andric CostKind) +
61600b57cec5SDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
61610b57cec5SDimitry Andric }
61620b57cec5SDimitry Andric StoreInst *SI = cast<StoreInst>(I);
61630b57cec5SDimitry Andric
6164fe013be4SDimitry Andric bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
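// Sketch of the intent: a store of a loop-invariant value needs no per-lane
// work, whereas a store of a varying value must first extract the final lane
// (index VF - 1) from the vector, hence the extra extractelement cost below.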
61650b57cec5SDimitry Andric return TTI.getAddressComputationCost(ValTy) +
61665ffd83dbSDimitry Andric TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
61675ffd83dbSDimitry Andric CostKind) +
61688bcb0991SDimitry Andric (isLoopInvariantStoreValue
61698bcb0991SDimitry Andric ? 0
61708bcb0991SDimitry Andric : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6171bdd1243dSDimitry Andric CostKind, VF.getKnownMinValue() - 1));
61720b57cec5SDimitry Andric }
61730b57cec5SDimitry Andric
6174e8d8bef9SDimitry Andric InstructionCost
6175e8d8bef9SDimitry Andric LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6176e8d8bef9SDimitry Andric ElementCount VF) {
6177fe6060f1SDimitry Andric Type *ValTy = getLoadStoreType(I);
61785ffd83dbSDimitry Andric auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
61795ffd83dbSDimitry Andric const Align Alignment = getLoadStoreAlignment(I);
61805ffd83dbSDimitry Andric const Value *Ptr = getLoadStorePointerOperand(I);
61810b57cec5SDimitry Andric
61820b57cec5SDimitry Andric return TTI.getAddressComputationCost(VectorTy) +
61835ffd83dbSDimitry Andric TTI.getGatherScatterOpCost(
61845ffd83dbSDimitry Andric I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
61855ffd83dbSDimitry Andric TargetTransformInfo::TCK_RecipThroughput, I);
61860b57cec5SDimitry Andric }
61870b57cec5SDimitry Andric
6188e8d8bef9SDimitry Andric InstructionCost
6189e8d8bef9SDimitry Andric LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6190e8d8bef9SDimitry Andric ElementCount VF) {
6191fe6060f1SDimitry Andric Type *ValTy = getLoadStoreType(I);
61925ffd83dbSDimitry Andric auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
61930b57cec5SDimitry Andric unsigned AS = getLoadStoreAddressSpace(I);
6194bdd1243dSDimitry Andric enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
61950b57cec5SDimitry Andric
61960b57cec5SDimitry Andric auto Group = getInterleavedAccessGroup(I);
61970b57cec5SDimitry Andric assert(Group && "Fail to get an interleaved access group.");
61980b57cec5SDimitry Andric
61990b57cec5SDimitry Andric unsigned InterleaveFactor = Group->getFactor();
6200e8d8bef9SDimitry Andric auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
62010b57cec5SDimitry Andric
6202349cc55cSDimitry Andric // Holds the indices of existing members in the interleaved group.
62030b57cec5SDimitry Andric SmallVector<unsigned, 4> Indices;
6204349cc55cSDimitry Andric for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6205349cc55cSDimitry Andric if (Group->getMember(IF))
6206349cc55cSDimitry Andric Indices.push_back(IF);
62070b57cec5SDimitry Andric
62080b57cec5SDimitry Andric // Calculate the cost of the whole interleaved group.
62090b57cec5SDimitry Andric bool UseMaskForGaps =
6210349cc55cSDimitry Andric (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6211349cc55cSDimitry Andric (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
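// For example (hypothetical group), a store group with factor 3 in which only
// members 0 and 2 exist leaves a gap at index 1, so the wide store must be
// masked to avoid writing the missing lane. Similarly, a group that would
// otherwise require a scalar epilogue must use masking when such an epilogue
// is not allowed.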
6212e8d8bef9SDimitry Andric InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
62135ffd83dbSDimitry Andric I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6214bdd1243dSDimitry Andric AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
62150b57cec5SDimitry Andric
62160b57cec5SDimitry Andric if (Group->isReverse()) {
62170b57cec5SDimitry Andric // TODO: Add support for reversed masked interleaved access.
62180b57cec5SDimitry Andric assert(!Legal->isMaskRequired(I) &&
62190b57cec5SDimitry Andric "Reverse masked interleaved access not supported.");
6220bdd1243dSDimitry Andric Cost += Group->getNumMembers() *
6221bdd1243dSDimitry Andric TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6222bdd1243dSDimitry Andric std::nullopt, CostKind, 0);
62230b57cec5SDimitry Andric }
62240b57cec5SDimitry Andric return Cost;
62250b57cec5SDimitry Andric }
62260b57cec5SDimitry Andric
6227bdd1243dSDimitry Andric std::optional<InstructionCost>
6228bdd1243dSDimitry Andric LoopVectorizationCostModel::getReductionPatternCost(
6229c9157d92SDimitry Andric Instruction *I, ElementCount VF, Type *Ty,
6230c9157d92SDimitry Andric TTI::TargetCostKind CostKind) const {
6231fe6060f1SDimitry Andric using namespace llvm::PatternMatch;
6232e8d8bef9SDimitry Andric // Early exit if there are no in-loop reductions to consider.
6233c9157d92SDimitry Andric if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6234bdd1243dSDimitry Andric return std::nullopt;
6235e8d8bef9SDimitry Andric auto *VectorTy = cast<VectorType>(Ty);
6236e8d8bef9SDimitry Andric
6237e8d8bef9SDimitry Andric // We are looking for one of the following patterns, and finding the minimal acceptable cost:
6238e8d8bef9SDimitry Andric // reduce(mul(ext(A), ext(B))) or
6239e8d8bef9SDimitry Andric // reduce(mul(A, B)) or
6240e8d8bef9SDimitry Andric // reduce(ext(A)) or
6241e8d8bef9SDimitry Andric // reduce(A).
6242e8d8bef9SDimitry Andric // The basic idea is that we walk down the tree to do that, finding the root
6243e8d8bef9SDimitry Andric // reduction instruction in InLoopReductionImmediateChains. From there we find
6244e8d8bef9SDimitry Andric // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6245e8d8bef9SDimitry Andric // of the components. If the reduction cost is lower then we return it for the
6246e8d8bef9SDimitry Andric // reduction instruction and 0 for the other instructions in the pattern. If
6247e8d8bef9SDimitry Andric // it is not, we return an invalid cost specifying the original cost method
6248e8d8bef9SDimitry Andric // should be used.
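// For example (a cost sketch only), for a chain reduce.add(mul(sext(A),
// sext(B))) the single fused mul-accumulate reduction cost reported by TTI is
// compared against the summed costs of the two extends, the multiply and the
// plain add reduction; if the fused cost wins, the reduction instruction is
// charged that cost and the feeding extends/multiply are charged 0.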
6249e8d8bef9SDimitry Andric Instruction *RetI = I;
6250fe6060f1SDimitry Andric if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6251e8d8bef9SDimitry Andric if (!RetI->hasOneUser())
6252bdd1243dSDimitry Andric return std::nullopt;
6253e8d8bef9SDimitry Andric RetI = RetI->user_back();
6254e8d8bef9SDimitry Andric }
6255bdd1243dSDimitry Andric
6256bdd1243dSDimitry Andric if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6257e8d8bef9SDimitry Andric RetI->user_back()->getOpcode() == Instruction::Add) {
6258e8d8bef9SDimitry Andric RetI = RetI->user_back();
6259e8d8bef9SDimitry Andric }
6260e8d8bef9SDimitry Andric
6261e8d8bef9SDimitry Andric // Test if the found instruction is a reduction, and if not return an invalid
6262e8d8bef9SDimitry Andric // cost specifying the parent to use the original cost modelling.
6263e8d8bef9SDimitry Andric if (!InLoopReductionImmediateChains.count(RetI))
6264bdd1243dSDimitry Andric return std::nullopt;
6265e8d8bef9SDimitry Andric
6266e8d8bef9SDimitry Andric // Find the reduction this chain is a part of and calculate the basic cost of
6267e8d8bef9SDimitry Andric // the reduction on its own.
6268c9157d92SDimitry Andric Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6269e8d8bef9SDimitry Andric Instruction *ReductionPhi = LastChain;
6270e8d8bef9SDimitry Andric while (!isa<PHINode>(ReductionPhi))
6271c9157d92SDimitry Andric ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6272e8d8bef9SDimitry Andric
6273fe6060f1SDimitry Andric const RecurrenceDescriptor &RdxDesc =
62740eae32dcSDimitry Andric Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6275fe6060f1SDimitry Andric
6276fe6060f1SDimitry Andric InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6277fe6060f1SDimitry Andric RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6278fe6060f1SDimitry Andric
62794824e7fdSDimitry Andric // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
62804824e7fdSDimitry Andric // normal fmul instruction to the cost of the fadd reduction.
62814824e7fdSDimitry Andric if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
62824824e7fdSDimitry Andric BaseCost +=
62834824e7fdSDimitry Andric TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
62844824e7fdSDimitry Andric
6285fe6060f1SDimitry Andric // If we're using ordered reductions then we can just return the base cost
6286fe6060f1SDimitry Andric // here, since getArithmeticReductionCost calculates the full ordered
6287fe6060f1SDimitry Andric // reduction cost when FP reassociation is not allowed.
6288fe6060f1SDimitry Andric if (useOrderedReductions(RdxDesc))
6289fe6060f1SDimitry Andric return BaseCost;
6290e8d8bef9SDimitry Andric
6291e8d8bef9SDimitry Andric // Get the operand that was not the reduction chain and match it to one of the
6292e8d8bef9SDimitry Andric // patterns, returning the better cost if it is found.
6293e8d8bef9SDimitry Andric Instruction *RedOp = RetI->getOperand(1) == LastChain
6294e8d8bef9SDimitry Andric ? dyn_cast<Instruction>(RetI->getOperand(0))
6295e8d8bef9SDimitry Andric : dyn_cast<Instruction>(RetI->getOperand(1));
6296e8d8bef9SDimitry Andric
6297e8d8bef9SDimitry Andric VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6298e8d8bef9SDimitry Andric
6299fe6060f1SDimitry Andric Instruction *Op0, *Op1;
6300bdd1243dSDimitry Andric if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6301349cc55cSDimitry Andric match(RedOp,
6302349cc55cSDimitry Andric m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6303349cc55cSDimitry Andric match(Op0, m_ZExtOrSExt(m_Value())) &&
6304349cc55cSDimitry Andric Op0->getOpcode() == Op1->getOpcode() &&
6305349cc55cSDimitry Andric Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6306349cc55cSDimitry Andric !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6307349cc55cSDimitry Andric (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6308349cc55cSDimitry Andric
6309bdd1243dSDimitry Andric // Matched reduce.add(ext(mul(ext(A), ext(B))))
6310349cc55cSDimitry Andric // Note that the extend opcodes need to all match, or if A==B they will have
6311349cc55cSDimitry Andric // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6312349cc55cSDimitry Andric // which is equally fine.
6313349cc55cSDimitry Andric bool IsUnsigned = isa<ZExtInst>(Op0);
6314349cc55cSDimitry Andric auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6315349cc55cSDimitry Andric auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6316349cc55cSDimitry Andric
6317349cc55cSDimitry Andric InstructionCost ExtCost =
6318349cc55cSDimitry Andric TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6319349cc55cSDimitry Andric TTI::CastContextHint::None, CostKind, Op0);
6320349cc55cSDimitry Andric InstructionCost MulCost =
6321349cc55cSDimitry Andric TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6322349cc55cSDimitry Andric InstructionCost Ext2Cost =
6323349cc55cSDimitry Andric TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6324349cc55cSDimitry Andric TTI::CastContextHint::None, CostKind, RedOp);
6325349cc55cSDimitry Andric
6326bdd1243dSDimitry Andric InstructionCost RedCost = TTI.getMulAccReductionCost(
6327bdd1243dSDimitry Andric IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6328349cc55cSDimitry Andric
6329349cc55cSDimitry Andric if (RedCost.isValid() &&
6330349cc55cSDimitry Andric RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6331349cc55cSDimitry Andric return I == RetI ? RedCost : 0;
6332349cc55cSDimitry Andric } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6333e8d8bef9SDimitry Andric !TheLoop->isLoopInvariant(RedOp)) {
6334fe6060f1SDimitry Andric // Matched reduce(ext(A))
6335e8d8bef9SDimitry Andric bool IsUnsigned = isa<ZExtInst>(RedOp);
6336e8d8bef9SDimitry Andric auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6337bdd1243dSDimitry Andric InstructionCost RedCost = TTI.getExtendedReductionCost(
6338bdd1243dSDimitry Andric RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6339bdd1243dSDimitry Andric RdxDesc.getFastMathFlags(), CostKind);
6340e8d8bef9SDimitry Andric
6341fe6060f1SDimitry Andric InstructionCost ExtCost =
6342e8d8bef9SDimitry Andric TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6343e8d8bef9SDimitry Andric TTI::CastContextHint::None, CostKind, RedOp);
6344e8d8bef9SDimitry Andric if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6345fe6060f1SDimitry Andric return I == RetI ? RedCost : 0;
6346bdd1243dSDimitry Andric } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6347fe6060f1SDimitry Andric match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6348fe6060f1SDimitry Andric if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6349e8d8bef9SDimitry Andric Op0->getOpcode() == Op1->getOpcode() &&
6350e8d8bef9SDimitry Andric !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6351e8d8bef9SDimitry Andric bool IsUnsigned = isa<ZExtInst>(Op0);
63520eae32dcSDimitry Andric Type *Op0Ty = Op0->getOperand(0)->getType();
63530eae32dcSDimitry Andric Type *Op1Ty = Op1->getOperand(0)->getType();
63540eae32dcSDimitry Andric Type *LargestOpTy =
63550eae32dcSDimitry Andric Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
63560eae32dcSDimitry Andric : Op0Ty;
63570eae32dcSDimitry Andric auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
63580eae32dcSDimitry Andric
6359bdd1243dSDimitry Andric // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
63600eae32dcSDimitry Andric // different sizes. We take the largest type as the ext to reduce, and add
63610eae32dcSDimitry Andric // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
63620eae32dcSDimitry Andric InstructionCost ExtCost0 = TTI.getCastInstrCost(
63630eae32dcSDimitry Andric Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6364e8d8bef9SDimitry Andric TTI::CastContextHint::None, CostKind, Op0);
63650eae32dcSDimitry Andric InstructionCost ExtCost1 = TTI.getCastInstrCost(
63660eae32dcSDimitry Andric Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
63670eae32dcSDimitry Andric TTI::CastContextHint::None, CostKind, Op1);
6368fe6060f1SDimitry Andric InstructionCost MulCost =
6369fe6060f1SDimitry Andric TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6370e8d8bef9SDimitry Andric
6371bdd1243dSDimitry Andric InstructionCost RedCost = TTI.getMulAccReductionCost(
6372bdd1243dSDimitry Andric IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
63730eae32dcSDimitry Andric InstructionCost ExtraExtCost = 0;
63740eae32dcSDimitry Andric if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
63750eae32dcSDimitry Andric Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
63760eae32dcSDimitry Andric ExtraExtCost = TTI.getCastInstrCost(
63770eae32dcSDimitry Andric ExtraExtOp->getOpcode(), ExtType,
63780eae32dcSDimitry Andric VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
63790eae32dcSDimitry Andric TTI::CastContextHint::None, CostKind, ExtraExtOp);
63800eae32dcSDimitry Andric }
6381e8d8bef9SDimitry Andric
63820eae32dcSDimitry Andric if (RedCost.isValid() &&
63830eae32dcSDimitry Andric (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6384fe6060f1SDimitry Andric return I == RetI ? RedCost : 0;
6385349cc55cSDimitry Andric } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6386bdd1243dSDimitry Andric // Matched reduce.add(mul())
6387fe6060f1SDimitry Andric InstructionCost MulCost =
6388fe6060f1SDimitry Andric TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6389e8d8bef9SDimitry Andric
6390bdd1243dSDimitry Andric InstructionCost RedCost = TTI.getMulAccReductionCost(
6391bdd1243dSDimitry Andric true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6392e8d8bef9SDimitry Andric
6393e8d8bef9SDimitry Andric if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6394fe6060f1SDimitry Andric return I == RetI ? RedCost : 0;
6395e8d8bef9SDimitry Andric }
6396e8d8bef9SDimitry Andric }
6397e8d8bef9SDimitry Andric
6398bdd1243dSDimitry Andric return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6399e8d8bef9SDimitry Andric }
6400e8d8bef9SDimitry Andric
6401e8d8bef9SDimitry Andric InstructionCost
6402e8d8bef9SDimitry Andric LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6403e8d8bef9SDimitry Andric ElementCount VF) {
64040b57cec5SDimitry Andric // Calculate scalar cost only. Vectorization cost should be ready at this
64050b57cec5SDimitry Andric // moment.
6406e8d8bef9SDimitry Andric if (VF.isScalar()) {
6407fe6060f1SDimitry Andric Type *ValTy = getLoadStoreType(I);
64085ffd83dbSDimitry Andric const Align Alignment = getLoadStoreAlignment(I);
64090b57cec5SDimitry Andric unsigned AS = getLoadStoreAddressSpace(I);
64100b57cec5SDimitry Andric
6411bdd1243dSDimitry Andric TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
64120b57cec5SDimitry Andric return TTI.getAddressComputationCost(ValTy) +
64135ffd83dbSDimitry Andric TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6414bdd1243dSDimitry Andric TTI::TCK_RecipThroughput, OpInfo, I);
64150b57cec5SDimitry Andric }
64160b57cec5SDimitry Andric return getWideningCost(I, VF);
64170b57cec5SDimitry Andric }
64180b57cec5SDimitry Andric
64190b57cec5SDimitry Andric LoopVectorizationCostModel::VectorizationCostTy
6420e8d8bef9SDimitry Andric LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6421e8d8bef9SDimitry Andric ElementCount VF) {
64220b57cec5SDimitry Andric // If we know that this instruction will remain uniform, check the cost of
64230b57cec5SDimitry Andric // the scalar version.
64240b57cec5SDimitry Andric if (isUniformAfterVectorization(I, VF))
6425e8d8bef9SDimitry Andric VF = ElementCount::getFixed(1);
64260b57cec5SDimitry Andric
6427e8d8bef9SDimitry Andric if (VF.isVector() && isProfitableToScalarize(I, VF))
64280b57cec5SDimitry Andric return VectorizationCostTy(InstsToScalarize[VF][I], false);
64290b57cec5SDimitry Andric
64300b57cec5SDimitry Andric // Forced scalars do not have any scalarization overhead.
64310b57cec5SDimitry Andric auto ForcedScalar = ForcedScalars.find(VF);
6432e8d8bef9SDimitry Andric if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
64330b57cec5SDimitry Andric auto InstSet = ForcedScalar->second;
64345ffd83dbSDimitry Andric if (InstSet.count(I))
6435e8d8bef9SDimitry Andric return VectorizationCostTy(
6436e8d8bef9SDimitry Andric (getInstructionCost(I, ElementCount::getFixed(1)).first *
6437e8d8bef9SDimitry Andric VF.getKnownMinValue()),
6438e8d8bef9SDimitry Andric false);
64390b57cec5SDimitry Andric }
64400b57cec5SDimitry Andric
64410b57cec5SDimitry Andric Type *VectorTy;
6442e8d8bef9SDimitry Andric InstructionCost C = getInstructionCost(I, VF, VectorTy);
64430b57cec5SDimitry Andric
6444349cc55cSDimitry Andric bool TypeNotScalarized = false;
6445349cc55cSDimitry Andric if (VF.isVector() && VectorTy->isVectorTy()) {
644681ad6265SDimitry Andric if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
644781ad6265SDimitry Andric if (VF.isScalable())
644881ad6265SDimitry Andric // <vscale x 1 x iN> is assumed to be profitable over iN because
644981ad6265SDimitry Andric // scalable registers are a distinct register class from scalar ones.
645081ad6265SDimitry Andric // If we ever find a target which wants to lower scalable vectors
645181ad6265SDimitry Andric // back to scalars, we'll need to update this code to explicitly
645281ad6265SDimitry Andric // ask TTI about the register class uses for each part.
645381ad6265SDimitry Andric TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6454349cc55cSDimitry Andric else
645581ad6265SDimitry Andric TypeNotScalarized = NumParts < VF.getKnownMinValue();
645681ad6265SDimitry Andric } else
6457349cc55cSDimitry Andric C = InstructionCost::getInvalid();
6458349cc55cSDimitry Andric }
64590b57cec5SDimitry Andric return VectorizationCostTy(C, TypeNotScalarized);
64600b57cec5SDimitry Andric }
64610b57cec5SDimitry Andric
6462bdd1243dSDimitry Andric InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6463bdd1243dSDimitry Andric Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
64640b57cec5SDimitry Andric
6465fe6060f1SDimitry Andric // There is no mechanism yet to create a scalable scalarization loop,
6466fe6060f1SDimitry Andric // so this is currently Invalid.
6467fe6060f1SDimitry Andric if (VF.isScalable())
6468fe6060f1SDimitry Andric return InstructionCost::getInvalid();
6469fe6060f1SDimitry Andric
6470e8d8bef9SDimitry Andric if (VF.isScalar())
64710b57cec5SDimitry Andric return 0;
64720b57cec5SDimitry Andric
6473e8d8bef9SDimitry Andric InstructionCost Cost = 0;
64740b57cec5SDimitry Andric Type *RetTy = ToVectorTy(I->getType(), VF);
64750b57cec5SDimitry Andric if (!RetTy->isVoidTy() &&
64760b57cec5SDimitry Andric (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
64775ffd83dbSDimitry Andric Cost += TTI.getScalarizationOverhead(
6478bdd1243dSDimitry Andric cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6479bdd1243dSDimitry Andric /*Insert*/ true,
6480bdd1243dSDimitry Andric /*Extract*/ false, CostKind);
64810b57cec5SDimitry Andric
64820b57cec5SDimitry Andric // Some targets keep addresses scalar.
64830b57cec5SDimitry Andric if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
64840b57cec5SDimitry Andric return Cost;
64850b57cec5SDimitry Andric
64860b57cec5SDimitry Andric // Some targets support efficient element stores.
64870b57cec5SDimitry Andric if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
64880b57cec5SDimitry Andric return Cost;
64890b57cec5SDimitry Andric
64900b57cec5SDimitry Andric // Collect operands to consider.
64910b57cec5SDimitry Andric CallInst *CI = dyn_cast<CallInst>(I);
6492349cc55cSDimitry Andric Instruction::op_range Ops = CI ? CI->args() : I->operands();
64930b57cec5SDimitry Andric
64940b57cec5SDimitry Andric // Skip operands that do not require extraction/scalarization and do not incur
64950b57cec5SDimitry Andric // any overhead.
6496fe6060f1SDimitry Andric SmallVector<Type *> Tys;
6497fe6060f1SDimitry Andric for (auto *V : filterExtractingOperands(Ops, VF))
6498fe6060f1SDimitry Andric Tys.push_back(MaybeVectorizeType(V->getType(), VF));
64990b57cec5SDimitry Andric return Cost + TTI.getOperandsScalarizationOverhead(
6500bdd1243dSDimitry Andric filterExtractingOperands(Ops, VF), Tys, CostKind);
65010b57cec5SDimitry Andric }
65020b57cec5SDimitry Andric
6503e8d8bef9SDimitry Andric void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6504e8d8bef9SDimitry Andric if (VF.isScalar())
65050b57cec5SDimitry Andric return;
65060b57cec5SDimitry Andric NumPredStores = 0;
65070b57cec5SDimitry Andric for (BasicBlock *BB : TheLoop->blocks()) {
65080b57cec5SDimitry Andric // For each instruction in the old loop.
65090b57cec5SDimitry Andric for (Instruction &I : *BB) {
65100b57cec5SDimitry Andric Value *Ptr = getLoadStorePointerOperand(&I);
65110b57cec5SDimitry Andric if (!Ptr)
65120b57cec5SDimitry Andric continue;
65130b57cec5SDimitry Andric
65140b57cec5SDimitry Andric // TODO: We should generate better code and update the cost model for
65150b57cec5SDimitry Andric // predicated uniform stores. Today they are treated as any other
65160b57cec5SDimitry Andric // predicated store (see added test cases in
65170b57cec5SDimitry Andric // invariant-store-vectorization.ll).
651804eeddc0SDimitry Andric if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
65190b57cec5SDimitry Andric NumPredStores++;
65200b57cec5SDimitry Andric
6521fe013be4SDimitry Andric if (Legal->isUniformMemOp(I, VF)) {
6522bdd1243dSDimitry Andric auto isLegalToScalarize = [&]() {
6523bdd1243dSDimitry Andric if (!VF.isScalable())
6524bdd1243dSDimitry Andric // Scalarization of fixed length vectors "just works".
6525bdd1243dSDimitry Andric return true;
6526bdd1243dSDimitry Andric
6527bdd1243dSDimitry Andric // We have dedicated lowering for unpredicated uniform loads and
6528bdd1243dSDimitry Andric // stores. Note that even with tail folding we know that at least
6529bdd1243dSDimitry Andric // one lane is active (i.e. generalized predication is not possible
6530bdd1243dSDimitry Andric // here), and the logic below depends on this fact.
6531bdd1243dSDimitry Andric if (!foldTailByMasking())
6532bdd1243dSDimitry Andric return true;
6533bdd1243dSDimitry Andric
6534bdd1243dSDimitry Andric // For scalable vectors, a uniform memop load is always
6535bdd1243dSDimitry Andric // uniform-by-parts and we know how to scalarize that.
6536bdd1243dSDimitry Andric if (isa<LoadInst>(I))
6537bdd1243dSDimitry Andric return true;
6538bdd1243dSDimitry Andric
6539bdd1243dSDimitry Andric // A uniform store isn't necessarily uniform-by-parts,
6540bdd1243dSDimitry Andric // so we can't assume scalarization.
6541bdd1243dSDimitry Andric auto &SI = cast<StoreInst>(I);
6542bdd1243dSDimitry Andric return TheLoop->isLoopInvariant(SI.getValueOperand());
6543bdd1243dSDimitry Andric };
6544bdd1243dSDimitry Andric
6545bdd1243dSDimitry Andric const InstructionCost GatherScatterCost =
6546bdd1243dSDimitry Andric isLegalGatherOrScatter(&I, VF) ?
6547bdd1243dSDimitry Andric getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6548bdd1243dSDimitry Andric
65490b57cec5SDimitry Andric // Load: Scalar load + broadcast
65500b57cec5SDimitry Andric // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6551bdd1243dSDimitry Andric // FIXME: This cost is a significant under-estimate for tail folded
6552bdd1243dSDimitry Andric // memory ops.
6553bdd1243dSDimitry Andric const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6554bdd1243dSDimitry Andric getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6555bdd1243dSDimitry Andric
6556bdd1243dSDimitry Andric // Choose the better solution for the current VF. Note that invalid
6557bdd1243dSDimitry Andric // costs compare as maximally large. If both are invalid, the chosen
6558bdd1243dSDimitry Andric // decision carries an invalid cost, which signals a failure and a vectorization abort.
6559bdd1243dSDimitry Andric if (GatherScatterCost < ScalarizationCost)
6560bdd1243dSDimitry Andric setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6561bdd1243dSDimitry Andric else
6562bdd1243dSDimitry Andric setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
65630b57cec5SDimitry Andric continue;
65640b57cec5SDimitry Andric }
65650b57cec5SDimitry Andric
65660b57cec5SDimitry Andric // We assume that widening is the best solution when possible.
65670b57cec5SDimitry Andric if (memoryInstructionCanBeWidened(&I, VF)) {
6568e8d8bef9SDimitry Andric InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6569349cc55cSDimitry Andric int ConsecutiveStride = Legal->isConsecutivePtr(
6570349cc55cSDimitry Andric getLoadStoreType(&I), getLoadStorePointerOperand(&I));
65710b57cec5SDimitry Andric assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
65720b57cec5SDimitry Andric "Expected consecutive stride.");
65730b57cec5SDimitry Andric InstWidening Decision =
65740b57cec5SDimitry Andric ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
65750b57cec5SDimitry Andric setWideningDecision(&I, VF, Decision, Cost);
65760b57cec5SDimitry Andric continue;
65770b57cec5SDimitry Andric }
65780b57cec5SDimitry Andric
65790b57cec5SDimitry Andric // Choose between Interleaving, Gather/Scatter or Scalarization.
6580fe6060f1SDimitry Andric InstructionCost InterleaveCost = InstructionCost::getInvalid();
65810b57cec5SDimitry Andric unsigned NumAccesses = 1;
65820b57cec5SDimitry Andric if (isAccessInterleaved(&I)) {
65830b57cec5SDimitry Andric auto Group = getInterleavedAccessGroup(&I);
65840b57cec5SDimitry Andric assert(Group && "Fail to get an interleaved access group.");
65850b57cec5SDimitry Andric
65860b57cec5SDimitry Andric // Make one decision for the whole group.
65870b57cec5SDimitry Andric if (getWideningDecision(&I, VF) != CM_Unknown)
65880b57cec5SDimitry Andric continue;
65890b57cec5SDimitry Andric
65900b57cec5SDimitry Andric NumAccesses = Group->getNumMembers();
65910b57cec5SDimitry Andric if (interleavedAccessCanBeWidened(&I, VF))
65920b57cec5SDimitry Andric InterleaveCost = getInterleaveGroupCost(&I, VF);
65930b57cec5SDimitry Andric }
65940b57cec5SDimitry Andric
6595e8d8bef9SDimitry Andric InstructionCost GatherScatterCost =
659604eeddc0SDimitry Andric isLegalGatherOrScatter(&I, VF)
65970b57cec5SDimitry Andric ? getGatherScatterCost(&I, VF) * NumAccesses
6598fe6060f1SDimitry Andric : InstructionCost::getInvalid();
65990b57cec5SDimitry Andric
6600e8d8bef9SDimitry Andric InstructionCost ScalarizationCost =
66010b57cec5SDimitry Andric getMemInstScalarizationCost(&I, VF) * NumAccesses;
66020b57cec5SDimitry Andric
66030b57cec5SDimitry Andric // Choose better solution for the current VF,
66040b57cec5SDimitry Andric // write down this decision and use it during vectorization.
6605e8d8bef9SDimitry Andric InstructionCost Cost;
66060b57cec5SDimitry Andric InstWidening Decision;
66070b57cec5SDimitry Andric if (InterleaveCost <= GatherScatterCost &&
66080b57cec5SDimitry Andric InterleaveCost < ScalarizationCost) {
66090b57cec5SDimitry Andric Decision = CM_Interleave;
66100b57cec5SDimitry Andric Cost = InterleaveCost;
66110b57cec5SDimitry Andric } else if (GatherScatterCost < ScalarizationCost) {
66120b57cec5SDimitry Andric Decision = CM_GatherScatter;
66130b57cec5SDimitry Andric Cost = GatherScatterCost;
66140b57cec5SDimitry Andric } else {
66150b57cec5SDimitry Andric Decision = CM_Scalarize;
66160b57cec5SDimitry Andric Cost = ScalarizationCost;
66170b57cec5SDimitry Andric }
66180b57cec5SDimitry Andric // If the instruction belongs to an interleave group, the whole group
66190b57cec5SDimitry Andric // receives the same decision. The cost is assigned to the group as a
66200b57cec5SDimitry Andric // whole, but will actually be charged to a single member instruction.
66210b57cec5SDimitry Andric if (auto Group = getInterleavedAccessGroup(&I))
66220b57cec5SDimitry Andric setWideningDecision(Group, VF, Decision, Cost);
66230b57cec5SDimitry Andric else
66240b57cec5SDimitry Andric setWideningDecision(&I, VF, Decision, Cost);
66250b57cec5SDimitry Andric }
66260b57cec5SDimitry Andric }
66270b57cec5SDimitry Andric
66280b57cec5SDimitry Andric // Make sure that any load of address and any other address computation
66290b57cec5SDimitry Andric // remains scalar unless there is gather/scatter support. This avoids
66300b57cec5SDimitry Andric // inevitable extracts into address registers, and also has the benefit of
66310b57cec5SDimitry Andric // activating LSR more, since that pass can't optimize vectorized
66320b57cec5SDimitry Andric // addresses.
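// For example (hypothetical IR), in
//   %p = load ptr, ptr %q
//   %v = load i32, ptr %p
// the first load produces an address, so unless the target prefers
// vectorized addressing (checked just below) it is forced to a scalarized
// decision (or its whole interleave group is) rather than being widened.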
66330b57cec5SDimitry Andric if (TTI.prefersVectorizedAddressing())
66340b57cec5SDimitry Andric return;
66350b57cec5SDimitry Andric
66360b57cec5SDimitry Andric // Start with all scalar pointer uses.
66370b57cec5SDimitry Andric SmallPtrSet<Instruction *, 8> AddrDefs;
66380b57cec5SDimitry Andric for (BasicBlock *BB : TheLoop->blocks())
66390b57cec5SDimitry Andric for (Instruction &I : *BB) {
66400b57cec5SDimitry Andric Instruction *PtrDef =
66410b57cec5SDimitry Andric dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
66420b57cec5SDimitry Andric if (PtrDef && TheLoop->contains(PtrDef) &&
66430b57cec5SDimitry Andric getWideningDecision(&I, VF) != CM_GatherScatter)
66440b57cec5SDimitry Andric AddrDefs.insert(PtrDef);
66450b57cec5SDimitry Andric }
66460b57cec5SDimitry Andric
66470b57cec5SDimitry Andric // Add all instructions used to generate the addresses.
66480b57cec5SDimitry Andric SmallVector<Instruction *, 4> Worklist;
6649e8d8bef9SDimitry Andric append_range(Worklist, AddrDefs);
66500b57cec5SDimitry Andric while (!Worklist.empty()) {
66510b57cec5SDimitry Andric Instruction *I = Worklist.pop_back_val();
66520b57cec5SDimitry Andric for (auto &Op : I->operands())
66530b57cec5SDimitry Andric if (auto *InstOp = dyn_cast<Instruction>(Op))
66540b57cec5SDimitry Andric if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
66550b57cec5SDimitry Andric AddrDefs.insert(InstOp).second)
66560b57cec5SDimitry Andric Worklist.push_back(InstOp);
66570b57cec5SDimitry Andric }
66580b57cec5SDimitry Andric
66590b57cec5SDimitry Andric for (auto *I : AddrDefs) {
66600b57cec5SDimitry Andric if (isa<LoadInst>(I)) {
66610b57cec5SDimitry Andric // Setting the desired widening decision should ideally be handled by
66620b57cec5SDimitry Andric // the cost functions, but since this involves the task of finding out
66630b57cec5SDimitry Andric // if the loaded register is involved in an address computation, it is
66640b57cec5SDimitry Andric // instead changed here when we know this is the case.
66650b57cec5SDimitry Andric InstWidening Decision = getWideningDecision(I, VF);
66660b57cec5SDimitry Andric if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
66670b57cec5SDimitry Andric // Scalarize a widened load of address.
6668e8d8bef9SDimitry Andric setWideningDecision(
6669e8d8bef9SDimitry Andric I, VF, CM_Scalarize,
6670e8d8bef9SDimitry Andric (VF.getKnownMinValue() *
6671e8d8bef9SDimitry Andric getMemoryInstructionCost(I, ElementCount::getFixed(1))));
66720b57cec5SDimitry Andric else if (auto Group = getInterleavedAccessGroup(I)) {
66730b57cec5SDimitry Andric // Scalarize an interleave group of address loads.
66740b57cec5SDimitry Andric for (unsigned I = 0; I < Group->getFactor(); ++I) {
66750b57cec5SDimitry Andric if (Instruction *Member = Group->getMember(I))
6676e8d8bef9SDimitry Andric setWideningDecision(
6677e8d8bef9SDimitry Andric Member, VF, CM_Scalarize,
6678e8d8bef9SDimitry Andric (VF.getKnownMinValue() *
6679e8d8bef9SDimitry Andric getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
66800b57cec5SDimitry Andric }
66810b57cec5SDimitry Andric }
66820b57cec5SDimitry Andric } else
66830b57cec5SDimitry Andric // Make sure I gets scalarized and a cost estimate without
66840b57cec5SDimitry Andric // scalarization overhead.
66850b57cec5SDimitry Andric ForcedScalars[VF].insert(I);
66860b57cec5SDimitry Andric }
66870b57cec5SDimitry Andric }
66880b57cec5SDimitry Andric
6689c9157d92SDimitry Andric void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6690c9157d92SDimitry Andric assert(!VF.isScalar() &&
6691c9157d92SDimitry Andric "Trying to set a vectorization decision for a scalar VF");
6692c9157d92SDimitry Andric
6693c9157d92SDimitry Andric for (BasicBlock *BB : TheLoop->blocks()) {
6694c9157d92SDimitry Andric // For each instruction in the old loop.
6695c9157d92SDimitry Andric for (Instruction &I : *BB) {
6696c9157d92SDimitry Andric CallInst *CI = dyn_cast<CallInst>(&I);
6697c9157d92SDimitry Andric
6698c9157d92SDimitry Andric if (!CI)
6699c9157d92SDimitry Andric continue;
6700c9157d92SDimitry Andric
6701c9157d92SDimitry Andric InstructionCost ScalarCost = InstructionCost::getInvalid();
6702c9157d92SDimitry Andric InstructionCost VectorCost = InstructionCost::getInvalid();
6703c9157d92SDimitry Andric InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6704c9157d92SDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6705c9157d92SDimitry Andric
6706c9157d92SDimitry Andric Function *ScalarFunc = CI->getCalledFunction();
6707c9157d92SDimitry Andric Type *ScalarRetTy = CI->getType();
6708c9157d92SDimitry Andric SmallVector<Type *, 4> Tys, ScalarTys;
6709c9157d92SDimitry Andric bool MaskRequired = Legal->isMaskRequired(CI);
6710c9157d92SDimitry Andric for (auto &ArgOp : CI->args())
6711c9157d92SDimitry Andric ScalarTys.push_back(ArgOp->getType());
6712c9157d92SDimitry Andric
6713c9157d92SDimitry Andric // Compute corresponding vector type for return value and arguments.
6714c9157d92SDimitry Andric Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6715c9157d92SDimitry Andric for (Type *ScalarTy : ScalarTys)
6716c9157d92SDimitry Andric Tys.push_back(ToVectorTy(ScalarTy, VF));
6717c9157d92SDimitry Andric
6718c9157d92SDimitry Andric // An in-loop reduction using an fmuladd intrinsic is a special case;
6719c9157d92SDimitry Andric // we don't want the normal cost for that intrinsic.
6720c9157d92SDimitry Andric if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6721c9157d92SDimitry Andric if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6722c9157d92SDimitry Andric setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6723c9157d92SDimitry Andric getVectorIntrinsicIDForCall(CI, TLI),
6724c9157d92SDimitry Andric std::nullopt, *RedCost);
6725c9157d92SDimitry Andric continue;
6726c9157d92SDimitry Andric }
6727c9157d92SDimitry Andric
6728c9157d92SDimitry Andric // Estimate cost of scalarized vector call. The source operands are
6729c9157d92SDimitry Andric // assumed to be vectors, so we need to extract individual elements from
6730c9157d92SDimitry Andric // there, execute VF scalar calls, and then gather the result into the
6731c9157d92SDimitry Andric // vector return value.
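// E.g. (illustrative) for VF = 4 and a call to float @foo(float), this is
// roughly: extract each of the 4 lanes, make 4 scalar calls, then insert the
// 4 results back into a vector; the extract/insert portion is accounted for
// by the scalarization overhead computed below.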
6732c9157d92SDimitry Andric InstructionCost ScalarCallCost =
6733c9157d92SDimitry Andric TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6734c9157d92SDimitry Andric
6735c9157d92SDimitry Andric // Compute costs of unpacking argument values for the scalar calls and
6736c9157d92SDimitry Andric // packing the return values to a vector.
6737c9157d92SDimitry Andric InstructionCost ScalarizationCost =
6738c9157d92SDimitry Andric getScalarizationOverhead(CI, VF, CostKind);
6739c9157d92SDimitry Andric
6740c9157d92SDimitry Andric ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6741c9157d92SDimitry Andric
6742c9157d92SDimitry Andric // Find the cost of vectorizing the call, if we can find a suitable
6743c9157d92SDimitry Andric // vector variant of the function.
6744c9157d92SDimitry Andric bool UsesMask = false;
6745c9157d92SDimitry Andric VFInfo FuncInfo;
6746c9157d92SDimitry Andric Function *VecFunc = nullptr;
6747c9157d92SDimitry Andric // Search through any available variants for one we can use at this VF.
6748c9157d92SDimitry Andric for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6749c9157d92SDimitry Andric // Must match requested VF.
6750c9157d92SDimitry Andric if (Info.Shape.VF != VF)
6751c9157d92SDimitry Andric continue;
6752c9157d92SDimitry Andric
6753c9157d92SDimitry Andric // Must take a mask argument if one is required
6754c9157d92SDimitry Andric if (MaskRequired && !Info.isMasked())
6755c9157d92SDimitry Andric continue;
6756c9157d92SDimitry Andric
6757c9157d92SDimitry Andric // Check that all parameter kinds are supported
6758c9157d92SDimitry Andric bool ParamsOk = true;
6759c9157d92SDimitry Andric for (VFParameter Param : Info.Shape.Parameters) {
6760c9157d92SDimitry Andric switch (Param.ParamKind) {
6761c9157d92SDimitry Andric case VFParamKind::Vector:
6762c9157d92SDimitry Andric break;
6763c9157d92SDimitry Andric case VFParamKind::OMP_Uniform: {
6764c9157d92SDimitry Andric Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6765c9157d92SDimitry Andric // Make sure the scalar parameter in the loop is invariant.
6766c9157d92SDimitry Andric if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6767c9157d92SDimitry Andric TheLoop))
6768c9157d92SDimitry Andric ParamsOk = false;
6769c9157d92SDimitry Andric break;
6770c9157d92SDimitry Andric }
6771c9157d92SDimitry Andric case VFParamKind::OMP_Linear: {
6772c9157d92SDimitry Andric Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6773c9157d92SDimitry Andric // Find the stride for the scalar parameter in this loop and see if
6774c9157d92SDimitry Andric // it matches the stride for the variant.
6775c9157d92SDimitry Andric // TODO: do we need to figure out the cost of an extract to get the
6776c9157d92SDimitry Andric // first lane? Or do we hope that it will be folded away?
6777c9157d92SDimitry Andric ScalarEvolution *SE = PSE.getSE();
6778c9157d92SDimitry Andric const auto *SAR =
6779c9157d92SDimitry Andric dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6780c9157d92SDimitry Andric
6781c9157d92SDimitry Andric if (!SAR || SAR->getLoop() != TheLoop) {
6782c9157d92SDimitry Andric ParamsOk = false;
6783c9157d92SDimitry Andric break;
6784c9157d92SDimitry Andric }
6785c9157d92SDimitry Andric
6786c9157d92SDimitry Andric const SCEVConstant *Step =
6787c9157d92SDimitry Andric dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6788c9157d92SDimitry Andric
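          // For example, a variant declared with 'linear(x:4)' only matches when
          // the corresponding argument advances by a constant step of 4 each
          // iteration; any other (or non-constant) step rejects the variant.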
6789c9157d92SDimitry Andric if (!Step ||
6790c9157d92SDimitry Andric Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6791c9157d92SDimitry Andric ParamsOk = false;
6792c9157d92SDimitry Andric
6793c9157d92SDimitry Andric break;
6794c9157d92SDimitry Andric }
6795c9157d92SDimitry Andric case VFParamKind::GlobalPredicate:
6796c9157d92SDimitry Andric UsesMask = true;
6797c9157d92SDimitry Andric break;
6798c9157d92SDimitry Andric default:
6799c9157d92SDimitry Andric ParamsOk = false;
6800c9157d92SDimitry Andric break;
6801c9157d92SDimitry Andric }
6802c9157d92SDimitry Andric }
6803c9157d92SDimitry Andric
6804c9157d92SDimitry Andric if (!ParamsOk)
6805c9157d92SDimitry Andric continue;
6806c9157d92SDimitry Andric
6807c9157d92SDimitry Andric // Found a suitable candidate, stop here.
6808c9157d92SDimitry Andric VecFunc = CI->getModule()->getFunction(Info.VectorName);
6809c9157d92SDimitry Andric FuncInfo = Info;
6810c9157d92SDimitry Andric break;
6811c9157d92SDimitry Andric }
6812c9157d92SDimitry Andric
6813c9157d92SDimitry Andric // Add the cost of synthesizing an all-true mask if the chosen variant takes
6813c9157d92SDimitry Andric // a mask but this call site does not require one.
6814c9157d92SDimitry Andric InstructionCost MaskCost = 0;
6815c9157d92SDimitry Andric if (VecFunc && UsesMask && !MaskRequired)
6816c9157d92SDimitry Andric MaskCost = TTI.getShuffleCost(
6817c9157d92SDimitry Andric TargetTransformInfo::SK_Broadcast,
6818c9157d92SDimitry Andric VectorType::get(IntegerType::getInt1Ty(
6819c9157d92SDimitry Andric VecFunc->getFunctionType()->getContext()),
6820c9157d92SDimitry Andric VF));
6821c9157d92SDimitry Andric
6822c9157d92SDimitry Andric if (TLI && VecFunc && !CI->isNoBuiltin())
6823c9157d92SDimitry Andric VectorCost =
6824c9157d92SDimitry Andric TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6825c9157d92SDimitry Andric
6826c9157d92SDimitry Andric // Find the cost of an intrinsic; some targets may have instructions that
6827c9157d92SDimitry Andric // perform the operation without needing an actual call.
6828c9157d92SDimitry Andric Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6829c9157d92SDimitry Andric if (IID != Intrinsic::not_intrinsic)
6830c9157d92SDimitry Andric IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6831c9157d92SDimitry Andric
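    // Choose the cheapest strategy, defaulting to scalarization. Because the
    // comparisons below use <=, a vector call is preferred over scalarization
    // on a tie, and an intrinsic call is preferred over both.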
6832c9157d92SDimitry Andric InstructionCost Cost = ScalarCost;
6833c9157d92SDimitry Andric InstWidening Decision = CM_Scalarize;
6834c9157d92SDimitry Andric
6835c9157d92SDimitry Andric if (VectorCost <= Cost) {
6836c9157d92SDimitry Andric Cost = VectorCost;
6837c9157d92SDimitry Andric Decision = CM_VectorCall;
6838c9157d92SDimitry Andric }
6839c9157d92SDimitry Andric
6840c9157d92SDimitry Andric if (IntrinsicCost <= Cost) {
6841c9157d92SDimitry Andric Cost = IntrinsicCost;
6842c9157d92SDimitry Andric Decision = CM_IntrinsicCall;
6843c9157d92SDimitry Andric }
6844c9157d92SDimitry Andric
6845c9157d92SDimitry Andric setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6846c9157d92SDimitry Andric FuncInfo.getParamIndexForOptionalMask(), Cost);
6847c9157d92SDimitry Andric }
6848c9157d92SDimitry Andric }
6849c9157d92SDimitry Andric }
6850c9157d92SDimitry Andric
6851e8d8bef9SDimitry Andric InstructionCost
6852e8d8bef9SDimitry Andric LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
68530b57cec5SDimitry Andric Type *&VectorTy) {
68540b57cec5SDimitry Andric Type *RetTy = I->getType();
68550b57cec5SDimitry Andric if (canTruncateToMinimalBitwidth(I, VF))
68560b57cec5SDimitry Andric RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
68570b57cec5SDimitry Andric auto SE = PSE.getSE();
68585ffd83dbSDimitry Andric TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
68590b57cec5SDimitry Andric
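  // Helper used by the assertion below: an instruction has a single copy after
  // vectorization if it is not scalarized itself and none of its users are
  // scalarized for this VF.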
6860fe6060f1SDimitry Andric auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6861fe6060f1SDimitry Andric ElementCount VF) -> bool {
6862fe6060f1SDimitry Andric if (VF.isScalar())
6863fe6060f1SDimitry Andric return true;
6864fe6060f1SDimitry Andric
6865fe6060f1SDimitry Andric auto Scalarized = InstsToScalarize.find(VF);
6866fe6060f1SDimitry Andric assert(Scalarized != InstsToScalarize.end() &&
6867fe6060f1SDimitry Andric "VF not yet analyzed for scalarization profitability");
6868fe6060f1SDimitry Andric return !Scalarized->second.count(I) &&
6869fe6060f1SDimitry Andric llvm::all_of(I->users(), [&](User *U) {
6870fe6060f1SDimitry Andric auto *UI = cast<Instruction>(U);
6871fe6060f1SDimitry Andric return !Scalarized->second.count(UI);
6872fe6060f1SDimitry Andric });
6873fe6060f1SDimitry Andric };
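  // The cast below silences unused-variable warnings in builds where
  // assertions are compiled out.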
6874fe6060f1SDimitry Andric (void) hasSingleCopyAfterVectorization;
6875fe6060f1SDimitry Andric
6876fe6060f1SDimitry Andric if (isScalarAfterVectorization(I, VF)) {
6877fe6060f1SDimitry Andric // With the exception of GEPs and PHIs, after scalarization there should
6878fe6060f1SDimitry Andric // only be one copy of the instruction generated in the loop. This is
6879fe6060f1SDimitry Andric // because the VF is either 1, or any instructions that need scalarizing
6880c9157d92SDimitry Andric // have already been dealt with by the time we get here. As a result,
6881fe6060f1SDimitry Andric // we don't have to multiply the instruction cost by VF.
6882fe6060f1SDimitry Andric assert(I->getOpcode() == Instruction::GetElementPtr ||
6883fe6060f1SDimitry Andric I->getOpcode() == Instruction::PHI ||
6884fe6060f1SDimitry Andric (I->getOpcode() == Instruction::BitCast &&
6885fe6060f1SDimitry Andric I->getType()->isPointerTy()) ||
6886fe6060f1SDimitry Andric hasSingleCopyAfterVectorization(I, VF));
6887fe6060f1SDimitry Andric VectorTy = RetTy;
6888fe6060f1SDimitry Andric } else
6889fe6060f1SDimitry Andric VectorTy = ToVectorTy(RetTy, VF);
6890fe6060f1SDimitry Andric
68910b57cec5SDimitry Andric // TODO: We need to estimate the cost of intrinsic calls.
68920b57cec5SDimitry Andric switch (I->getOpcode()) {
68930b57cec5SDimitry Andric case Instruction::GetElementPtr:
68940b57cec5SDimitry Andric // We mark this instruction as zero-cost because the cost of GEPs in
68950b57cec5SDimitry Andric // vectorized code depends on whether the corresponding memory instruction
68960b57cec5SDimitry Andric // is scalarized or not. Therefore, we handle GEPs with the memory
68970b57cec5SDimitry Andric // instruction cost.
68980b57cec5SDimitry Andric return 0;
68990b57cec5SDimitry Andric case Instruction::Br: {
69000b57cec5SDimitry Andric // In cases of scalarized and predicated instructions, there will be VF
69010b57cec5SDimitry Andric // predicated blocks in the vectorized loop. Each branch around these
69020b57cec5SDimitry Andric // blocks also requires an extract of its vector compare i1 element.
69030b57cec5SDimitry Andric bool ScalarPredicatedBB = false;
69040b57cec5SDimitry Andric BranchInst *BI = cast<BranchInst>(I);
6905e8d8bef9SDimitry Andric if (VF.isVector() && BI->isConditional() &&
6906753f127fSDimitry Andric (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6907753f127fSDimitry Andric PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
69080b57cec5SDimitry Andric ScalarPredicatedBB = true;
69090b57cec5SDimitry Andric
69100b57cec5SDimitry Andric if (ScalarPredicatedBB) {
6911fe6060f1SDimitry Andric // Not possible to scalarize a scalable vector with predicated instructions.
6912fe6060f1SDimitry Andric if (VF.isScalable())
6913fe6060f1SDimitry Andric return InstructionCost::getInvalid();
69140b57cec5SDimitry Andric // Return cost for branches around scalarized and predicated blocks.
69155ffd83dbSDimitry Andric auto *Vec_i1Ty =
6916e8d8bef9SDimitry Andric VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6917fe6060f1SDimitry Andric return (
6918fe6060f1SDimitry Andric TTI.getScalarizationOverhead(
6919bdd1243dSDimitry Andric Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6920bdd1243dSDimitry Andric /*Insert*/ false, /*Extract*/ true, CostKind) +
6921fe6060f1SDimitry Andric (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6922e8d8bef9SDimitry Andric } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
69230b57cec5SDimitry Andric // The back-edge branch will remain, as will all scalar branches.
69245ffd83dbSDimitry Andric return TTI.getCFInstrCost(Instruction::Br, CostKind);
69250b57cec5SDimitry Andric else
69260b57cec5SDimitry Andric // This branch will be eliminated by if-conversion.
69270b57cec5SDimitry Andric return 0;
69280b57cec5SDimitry Andric // Note: We currently assume zero cost for an unconditional branch inside
69290b57cec5SDimitry Andric // a predicated block since it will become a fall-through, although we
69300b57cec5SDimitry Andric // may decide in the future to call TTI for all branches.
69310b57cec5SDimitry Andric }
69320b57cec5SDimitry Andric case Instruction::PHI: {
69330b57cec5SDimitry Andric auto *Phi = cast<PHINode>(I);
69340b57cec5SDimitry Andric
69350b57cec5SDimitry Andric // Fixed-order recurrences are replaced by vector shuffles inside the loop.
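    // Cost this as a splice: the shuffle takes the last element of the
    // previous iteration's vector followed by the first VF-1 elements of the
    // current vector, i.e. mask {VF-1, VF, ..., 2*VF-2}.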
6936bdd1243dSDimitry Andric if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6937bdd1243dSDimitry Andric SmallVector<int> Mask(VF.getKnownMinValue());
6938bdd1243dSDimitry Andric std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6939bdd1243dSDimitry Andric return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6940bdd1243dSDimitry Andric cast<VectorType>(VectorTy), Mask, CostKind,
6941bdd1243dSDimitry Andric VF.getKnownMinValue() - 1);
6942bdd1243dSDimitry Andric }
69430b57cec5SDimitry Andric
69440b57cec5SDimitry Andric // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
69450b57cec5SDimitry Andric // converted into select instructions. We require N - 1 selects per phi
69460b57cec5SDimitry Andric // node, where N is the number of incoming values.
6947e8d8bef9SDimitry Andric if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
69480b57cec5SDimitry Andric return (Phi->getNumIncomingValues() - 1) *
69490b57cec5SDimitry Andric TTI.getCmpSelInstrCost(
69500b57cec5SDimitry Andric Instruction::Select, ToVectorTy(Phi->getType(), VF),
69515ffd83dbSDimitry Andric ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6952e8d8bef9SDimitry Andric CmpInst::BAD_ICMP_PREDICATE, CostKind);
69530b57cec5SDimitry Andric
69545ffd83dbSDimitry Andric return TTI.getCFInstrCost(Instruction::PHI, CostKind);
69550b57cec5SDimitry Andric }
69560b57cec5SDimitry Andric case Instruction::UDiv:
69570b57cec5SDimitry Andric case Instruction::SDiv:
69580b57cec5SDimitry Andric case Instruction::URem:
69590b57cec5SDimitry Andric case Instruction::SRem:
6960bdd1243dSDimitry Andric if (VF.isVector() && isPredicatedInst(I)) {
6961bdd1243dSDimitry Andric const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6962bdd1243dSDimitry Andric return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6963bdd1243dSDimitry Andric ScalarCost : SafeDivisorCost;
69640b57cec5SDimitry Andric }
6965bdd1243dSDimitry Andric // We've proven all lanes safe to speculate, fall through.
6966bdd1243dSDimitry Andric [[fallthrough]];
69670b57cec5SDimitry Andric case Instruction::Add:
69680b57cec5SDimitry Andric case Instruction::FAdd:
69690b57cec5SDimitry Andric case Instruction::Sub:
69700b57cec5SDimitry Andric case Instruction::FSub:
69710b57cec5SDimitry Andric case Instruction::Mul:
69720b57cec5SDimitry Andric case Instruction::FMul:
69730b57cec5SDimitry Andric case Instruction::FDiv:
69740b57cec5SDimitry Andric case Instruction::FRem:
69750b57cec5SDimitry Andric case Instruction::Shl:
69760b57cec5SDimitry Andric case Instruction::LShr:
69770b57cec5SDimitry Andric case Instruction::AShr:
69780b57cec5SDimitry Andric case Instruction::And:
69790b57cec5SDimitry Andric case Instruction::Or:
69800b57cec5SDimitry Andric case Instruction::Xor: {
6981fe013be4SDimitry Andric // If we're speculating on the stride being 1, the multiplication may
6982fe013be4SDimitry Andric // fold away. We can generalize this for all operations using the notion
6983fe013be4SDimitry Andric // of neutral elements. (TODO)
6984fe013be4SDimitry Andric if (I->getOpcode() == Instruction::Mul &&
6985fe013be4SDimitry Andric (PSE.getSCEV(I->getOperand(0))->isOne() ||
6986fe013be4SDimitry Andric PSE.getSCEV(I->getOperand(1))->isOne()))
69870b57cec5SDimitry Andric return 0;
6988e8d8bef9SDimitry Andric
6989e8d8bef9SDimitry Andric // Detect reduction patterns
6990fe6060f1SDimitry Andric if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6991fe6060f1SDimitry Andric return *RedCost;
6992e8d8bef9SDimitry Andric
69930b57cec5SDimitry Andric // Certain instructions can be cheaper to vectorize if they have a constant
69940b57cec5SDimitry Andric // second vector operand. One example of this is shifts on x86.
69950b57cec5SDimitry Andric Value *Op2 = I->getOperand(1);
6996bdd1243dSDimitry Andric auto Op2Info = TTI.getOperandInfo(Op2);
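    // A loop-invariant operand is splatted outside the loop, so treat it as a
    // uniform value for costing purposes.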
6997fe013be4SDimitry Andric if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6998fe013be4SDimitry Andric Legal->isInvariant(Op2))
6999bdd1243dSDimitry Andric Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
70000b57cec5SDimitry Andric
70010b57cec5SDimitry Andric SmallVector<const Value *, 4> Operands(I->operand_values());
7002a58f00eaSDimitry Andric auto InstrCost = TTI.getArithmeticInstrCost(
7003bdd1243dSDimitry Andric I->getOpcode(), VectorTy, CostKind,
7004bdd1243dSDimitry Andric {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7005bdd1243dSDimitry Andric Op2Info, Operands, I);
7006a58f00eaSDimitry Andric
7007a58f00eaSDimitry Andric // Some targets can replace frem with vector library calls.
7008a58f00eaSDimitry Andric InstructionCost VecCallCost = InstructionCost::getInvalid();
7009a58f00eaSDimitry Andric if (I->getOpcode() == Instruction::FRem) {
7010a58f00eaSDimitry Andric LibFunc Func;
7011a58f00eaSDimitry Andric if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
7012a58f00eaSDimitry Andric TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
7013a58f00eaSDimitry Andric SmallVector<Type *, 4> OpTypes;
7014a58f00eaSDimitry Andric for (auto &Op : I->operands())
7015a58f00eaSDimitry Andric OpTypes.push_back(Op->getType());
7016a58f00eaSDimitry Andric VecCallCost =
7017a58f00eaSDimitry Andric TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
7018a58f00eaSDimitry Andric }
7019a58f00eaSDimitry Andric }
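    // Take whichever is cheaper; the library-call cost stays invalid (and is
    // therefore never selected) when no vectorizable mapping exists.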
7020a58f00eaSDimitry Andric return std::min(InstrCost, VecCallCost);
70210b57cec5SDimitry Andric }
70220b57cec5SDimitry Andric case Instruction::FNeg: {
7023fe6060f1SDimitry Andric return TTI.getArithmeticInstrCost(
7024bdd1243dSDimitry Andric I->getOpcode(), VectorTy, CostKind,
7025bdd1243dSDimitry Andric {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7026bdd1243dSDimitry Andric {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7027bdd1243dSDimitry Andric I->getOperand(0), I);
70280b57cec5SDimitry Andric }
70290b57cec5SDimitry Andric case Instruction::Select: {
70300b57cec5SDimitry Andric SelectInst *SI = cast<SelectInst>(I);
70310b57cec5SDimitry Andric const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
70320b57cec5SDimitry Andric bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7033fe6060f1SDimitry Andric
7034fe6060f1SDimitry Andric const Value *Op0, *Op1;
7035fe6060f1SDimitry Andric using namespace llvm::PatternMatch;
7036fe6060f1SDimitry Andric if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7037fe6060f1SDimitry Andric match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7038fe6060f1SDimitry Andric // select x, y, false --> x & y
7039fe6060f1SDimitry Andric // select x, true, y --> x | y
7040bdd1243dSDimitry Andric const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7041bdd1243dSDimitry Andric const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7042fe6060f1SDimitry Andric assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7043fe6060f1SDimitry Andric Op1->getType()->getScalarSizeInBits() == 1);
7044fe6060f1SDimitry Andric
7045fe6060f1SDimitry Andric SmallVector<const Value *, 2> Operands{Op0, Op1};
7046fe6060f1SDimitry Andric return TTI.getArithmeticInstrCost(
7047fe6060f1SDimitry Andric match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7048bdd1243dSDimitry Andric CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7049fe6060f1SDimitry Andric }
7050fe6060f1SDimitry Andric
70510b57cec5SDimitry Andric Type *CondTy = SI->getCondition()->getType();
70520b57cec5SDimitry Andric if (!ScalarCond)
7053e8d8bef9SDimitry Andric CondTy = VectorType::get(CondTy, VF);
70540eae32dcSDimitry Andric
70550eae32dcSDimitry Andric CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
70560eae32dcSDimitry Andric if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
70570eae32dcSDimitry Andric Pred = Cmp->getPredicate();
70580eae32dcSDimitry Andric return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
70590eae32dcSDimitry Andric CostKind, I);
70600b57cec5SDimitry Andric }
70610b57cec5SDimitry Andric case Instruction::ICmp:
70620b57cec5SDimitry Andric case Instruction::FCmp: {
70630b57cec5SDimitry Andric Type *ValTy = I->getOperand(0)->getType();
70640b57cec5SDimitry Andric Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
70650b57cec5SDimitry Andric if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
70660b57cec5SDimitry Andric ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
70670b57cec5SDimitry Andric VectorTy = ToVectorTy(ValTy, VF);
7068e8d8bef9SDimitry Andric return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
70690eae32dcSDimitry Andric cast<CmpInst>(I)->getPredicate(), CostKind,
70700eae32dcSDimitry Andric I);
70710b57cec5SDimitry Andric }
70720b57cec5SDimitry Andric case Instruction::Store:
70730b57cec5SDimitry Andric case Instruction::Load: {
7074e8d8bef9SDimitry Andric ElementCount Width = VF;
7075e8d8bef9SDimitry Andric if (Width.isVector()) {
70760b57cec5SDimitry Andric InstWidening Decision = getWideningDecision(I, Width);
70770b57cec5SDimitry Andric assert(Decision != CM_Unknown &&
70780b57cec5SDimitry Andric "CM decision should be taken at this point");
7079fcaf7f86SDimitry Andric if (getWideningCost(I, VF) == InstructionCost::getInvalid())
708081ad6265SDimitry Andric return InstructionCost::getInvalid();
7081fcaf7f86SDimitry Andric if (Decision == CM_Scalarize)
7082e8d8bef9SDimitry Andric Width = ElementCount::getFixed(1);
70830b57cec5SDimitry Andric }
7084fe6060f1SDimitry Andric VectorTy = ToVectorTy(getLoadStoreType(I), Width);
70850b57cec5SDimitry Andric return getMemoryInstructionCost(I, VF);
70860b57cec5SDimitry Andric }
7087fe6060f1SDimitry Andric case Instruction::BitCast:
7088fe6060f1SDimitry Andric if (I->getType()->isPointerTy())
7089fe6060f1SDimitry Andric return 0;
7090bdd1243dSDimitry Andric [[fallthrough]];
70910b57cec5SDimitry Andric case Instruction::ZExt:
70920b57cec5SDimitry Andric case Instruction::SExt:
70930b57cec5SDimitry Andric case Instruction::FPToUI:
70940b57cec5SDimitry Andric case Instruction::FPToSI:
70950b57cec5SDimitry Andric case Instruction::FPExt:
70960b57cec5SDimitry Andric case Instruction::PtrToInt:
70970b57cec5SDimitry Andric case Instruction::IntToPtr:
70980b57cec5SDimitry Andric case Instruction::SIToFP:
70990b57cec5SDimitry Andric case Instruction::UIToFP:
71000b57cec5SDimitry Andric case Instruction::Trunc:
7101fe6060f1SDimitry Andric case Instruction::FPTrunc: {
7102e8d8bef9SDimitry Andric // Computes the CastContextHint from a Load/Store instruction.
7103e8d8bef9SDimitry Andric auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7104e8d8bef9SDimitry Andric assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7105e8d8bef9SDimitry Andric "Expected a load or a store!");
7106e8d8bef9SDimitry Andric
7107e8d8bef9SDimitry Andric if (VF.isScalar() || !TheLoop->contains(I))
7108e8d8bef9SDimitry Andric return TTI::CastContextHint::Normal;
7109e8d8bef9SDimitry Andric
7110e8d8bef9SDimitry Andric switch (getWideningDecision(I, VF)) {
7111e8d8bef9SDimitry Andric case LoopVectorizationCostModel::CM_GatherScatter:
7112e8d8bef9SDimitry Andric return TTI::CastContextHint::GatherScatter;
7113e8d8bef9SDimitry Andric case LoopVectorizationCostModel::CM_Interleave:
7114e8d8bef9SDimitry Andric return TTI::CastContextHint::Interleave;
7115e8d8bef9SDimitry Andric case LoopVectorizationCostModel::CM_Scalarize:
7116e8d8bef9SDimitry Andric case LoopVectorizationCostModel::CM_Widen:
7117e8d8bef9SDimitry Andric return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7118e8d8bef9SDimitry Andric : TTI::CastContextHint::Normal;
7119e8d8bef9SDimitry Andric case LoopVectorizationCostModel::CM_Widen_Reverse:
7120e8d8bef9SDimitry Andric return TTI::CastContextHint::Reversed;
7121e8d8bef9SDimitry Andric case LoopVectorizationCostModel::CM_Unknown:
7122e8d8bef9SDimitry Andric llvm_unreachable("Instr did not go through cost modelling?");
7123c9157d92SDimitry Andric case LoopVectorizationCostModel::CM_VectorCall:
7124c9157d92SDimitry Andric case LoopVectorizationCostModel::CM_IntrinsicCall:
7125c9157d92SDimitry Andric llvm_unreachable_internal("Instr has invalid widening decision");
7126e8d8bef9SDimitry Andric }
7127e8d8bef9SDimitry Andric
7128e8d8bef9SDimitry Andric llvm_unreachable("Unhandled case!");
7129e8d8bef9SDimitry Andric };
7130e8d8bef9SDimitry Andric
7131e8d8bef9SDimitry Andric unsigned Opcode = I->getOpcode();
7132e8d8bef9SDimitry Andric TTI::CastContextHint CCH = TTI::CastContextHint::None;
7133e8d8bef9SDimitry Andric // For Trunc, the context is the only user, which must be a StoreInst.
7134e8d8bef9SDimitry Andric if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7135e8d8bef9SDimitry Andric if (I->hasOneUse())
7136e8d8bef9SDimitry Andric if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7137e8d8bef9SDimitry Andric CCH = ComputeCCH(Store);
7138e8d8bef9SDimitry Andric }
7139e8d8bef9SDimitry Andric // For Z/Sext, the context is the operand, which must be a LoadInst.
7140e8d8bef9SDimitry Andric else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7141e8d8bef9SDimitry Andric Opcode == Instruction::FPExt) {
7142e8d8bef9SDimitry Andric if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7143e8d8bef9SDimitry Andric CCH = ComputeCCH(Load);
7144e8d8bef9SDimitry Andric }
7145e8d8bef9SDimitry Andric
71460b57cec5SDimitry Andric // We optimize the truncation of induction variables having constant
71470b57cec5SDimitry Andric // integer steps. The cost of these truncations is the same as the scalar
71480b57cec5SDimitry Andric // operation.
71490b57cec5SDimitry Andric if (isOptimizableIVTruncate(I, VF)) {
71500b57cec5SDimitry Andric auto *Trunc = cast<TruncInst>(I);
71510b57cec5SDimitry Andric return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7152e8d8bef9SDimitry Andric Trunc->getSrcTy(), CCH, CostKind, Trunc);
71530b57cec5SDimitry Andric }
71540b57cec5SDimitry Andric
7155e8d8bef9SDimitry Andric // Detect reduction patterns
7156fe6060f1SDimitry Andric if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7157fe6060f1SDimitry Andric return *RedCost;
7158e8d8bef9SDimitry Andric
71590b57cec5SDimitry Andric Type *SrcScalarTy = I->getOperand(0)->getType();
71600b57cec5SDimitry Andric Type *SrcVecTy =
71610b57cec5SDimitry Andric VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
71620b57cec5SDimitry Andric if (canTruncateToMinimalBitwidth(I, VF)) {
71630b57cec5SDimitry Andric // This cast is going to be shrunk. This may remove the cast or it might
71640b57cec5SDimitry Andric // turn it into slightly different cast. For example, if MinBW == 16,
71650b57cec5SDimitry Andric // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
71660b57cec5SDimitry Andric //
71670b57cec5SDimitry Andric // Calculate the modified src and dest types.
71680b57cec5SDimitry Andric Type *MinVecTy = VectorTy;
7169e8d8bef9SDimitry Andric if (Opcode == Instruction::Trunc) {
71700b57cec5SDimitry Andric SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
71710b57cec5SDimitry Andric VectorTy =
71720b57cec5SDimitry Andric largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7173e8d8bef9SDimitry Andric } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7174fe013be4SDimitry Andric // Leave SrcVecTy unchanged - we only shrink the destination element
7175fe013be4SDimitry Andric // type.
71760b57cec5SDimitry Andric VectorTy =
71770b57cec5SDimitry Andric smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
71780b57cec5SDimitry Andric }
71790b57cec5SDimitry Andric }
71800b57cec5SDimitry Andric
7181fe6060f1SDimitry Andric return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
71820b57cec5SDimitry Andric }
7183c9157d92SDimitry Andric case Instruction::Call:
7184c9157d92SDimitry Andric return getVectorCallCost(cast<CallInst>(I), VF);
7185e8d8bef9SDimitry Andric case Instruction::ExtractValue:
7186e8d8bef9SDimitry Andric return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7187fe6060f1SDimitry Andric case Instruction::Alloca:
7188fe6060f1SDimitry Andric // We cannot easily widen alloca to a scalable alloca, as
7189fe6060f1SDimitry Andric // the result would need to be a vector of pointers.
7190fe6060f1SDimitry Andric if (VF.isScalable())
7191fe6060f1SDimitry Andric return InstructionCost::getInvalid();
7192bdd1243dSDimitry Andric [[fallthrough]];
71930b57cec5SDimitry Andric default:
7194fe6060f1SDimitry Andric // This opcode is unknown. Assume that it is the same as 'mul'.
7195fe6060f1SDimitry Andric return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
71960b57cec5SDimitry Andric } // end of switch.
71970b57cec5SDimitry Andric }
71980b57cec5SDimitry Andric
71990b57cec5SDimitry Andric void LoopVectorizationCostModel::collectValuesToIgnore() {
72000b57cec5SDimitry Andric // Ignore ephemeral values.
72010b57cec5SDimitry Andric CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
72020b57cec5SDimitry Andric
720381ad6265SDimitry Andric // Find all stores to invariant variables. Since they are going to sink
720481ad6265SDimitry Andric // outside the loop, we do not need to calculate a cost for them.
720581ad6265SDimitry Andric for (BasicBlock *BB : TheLoop->blocks())
720681ad6265SDimitry Andric for (Instruction &I : *BB) {
720781ad6265SDimitry Andric StoreInst *SI;
720881ad6265SDimitry Andric if ((SI = dyn_cast<StoreInst>(&I)) &&
720981ad6265SDimitry Andric Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
721081ad6265SDimitry Andric ValuesToIgnore.insert(&I);
721181ad6265SDimitry Andric }
721281ad6265SDimitry Andric
72130b57cec5SDimitry Andric // Ignore type-promoting instructions we identified during reduction
72140b57cec5SDimitry Andric // detection.
7215bdd1243dSDimitry Andric for (const auto &Reduction : Legal->getReductionVars()) {
72160eae32dcSDimitry Andric const RecurrenceDescriptor &RedDes = Reduction.second;
7217e8d8bef9SDimitry Andric const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
72180b57cec5SDimitry Andric VecValuesToIgnore.insert(Casts.begin(), Casts.end());
72190b57cec5SDimitry Andric }
72200b57cec5SDimitry Andric // Ignore type-casting instructions we identified during induction
72210b57cec5SDimitry Andric // detection.
7222bdd1243dSDimitry Andric for (const auto &Induction : Legal->getInductionVars()) {
72230eae32dcSDimitry Andric const InductionDescriptor &IndDes = Induction.second;
72240b57cec5SDimitry Andric const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
72250b57cec5SDimitry Andric VecValuesToIgnore.insert(Casts.begin(), Casts.end());
72260b57cec5SDimitry Andric }
72270b57cec5SDimitry Andric }
72280b57cec5SDimitry Andric
7229e8d8bef9SDimitry Andric void LoopVectorizationCostModel::collectInLoopReductions() {
7230bdd1243dSDimitry Andric for (const auto &Reduction : Legal->getReductionVars()) {
7231e8d8bef9SDimitry Andric PHINode *Phi = Reduction.first;
72320eae32dcSDimitry Andric const RecurrenceDescriptor &RdxDesc = Reduction.second;
7233e8d8bef9SDimitry Andric
7234e8d8bef9SDimitry Andric // We don't collect reductions that are type promoted (yet).
7235e8d8bef9SDimitry Andric if (RdxDesc.getRecurrenceType() != Phi->getType())
7236e8d8bef9SDimitry Andric continue;
7237e8d8bef9SDimitry Andric
7238e8d8bef9SDimitry Andric // If the target would prefer this reduction to happen "in-loop", then we
7239e8d8bef9SDimitry Andric // want to record it as such.
7240e8d8bef9SDimitry Andric unsigned Opcode = RdxDesc.getOpcode();
7241fe6060f1SDimitry Andric if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7242e8d8bef9SDimitry Andric !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7243e8d8bef9SDimitry Andric TargetTransformInfo::ReductionFlags()))
7244e8d8bef9SDimitry Andric continue;
7245e8d8bef9SDimitry Andric
7246e8d8bef9SDimitry Andric // Check that we can correctly put the reductions into the loop, by
7247e8d8bef9SDimitry Andric // finding the chain of operations that leads from the phi to the loop
7248e8d8bef9SDimitry Andric // exit value.
7249e8d8bef9SDimitry Andric SmallVector<Instruction *, 4> ReductionOperations =
7250e8d8bef9SDimitry Andric RdxDesc.getReductionOpChain(Phi, TheLoop);
7251e8d8bef9SDimitry Andric bool InLoop = !ReductionOperations.empty();
7252c9157d92SDimitry Andric
7253e8d8bef9SDimitry Andric if (InLoop) {
7254c9157d92SDimitry Andric InLoopReductions.insert(Phi);
7255e8d8bef9SDimitry Andric // Add the elements to InLoopReductionImmediateChains for cost modelling.
7256e8d8bef9SDimitry Andric Instruction *LastChain = Phi;
7257e8d8bef9SDimitry Andric for (auto *I : ReductionOperations) {
7258e8d8bef9SDimitry Andric InLoopReductionImmediateChains[I] = LastChain;
7259e8d8bef9SDimitry Andric LastChain = I;
7260e8d8bef9SDimitry Andric }
7261e8d8bef9SDimitry Andric }
7262e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7263e8d8bef9SDimitry Andric << " reduction for phi: " << *Phi << "\n");
7264e8d8bef9SDimitry Andric }
7265e8d8bef9SDimitry Andric }
7266e8d8bef9SDimitry Andric
7267c9157d92SDimitry Andric VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7268c9157d92SDimitry Andric DebugLoc DL, const Twine &Name) {
7269c9157d92SDimitry Andric assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7270c9157d92SDimitry Andric Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7271c9157d92SDimitry Andric return tryInsertInstruction(
7272c9157d92SDimitry Andric new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7273c9157d92SDimitry Andric }
7274c9157d92SDimitry Andric
7275c9157d92SDimitry Andric // This function will select a scalable VF if the target supports scalable
7276c9157d92SDimitry Andric // vectors and a fixed one otherwise.
72770b57cec5SDimitry Andric // TODO: we could return a pair of values that specify the max VF and
72780b57cec5SDimitry Andric // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
72790b57cec5SDimitry Andric // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
72800b57cec5SDimitry Andric // doesn't have a cost model that can choose which plan to execute if
72810b57cec5SDimitry Andric // more than one is generated.
7282c9157d92SDimitry Andric static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
72830b57cec5SDimitry Andric LoopVectorizationCostModel &CM) {
72840b57cec5SDimitry Andric unsigned WidestType;
72850b57cec5SDimitry Andric std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7286c9157d92SDimitry Andric
7287c9157d92SDimitry Andric TargetTransformInfo::RegisterKind RegKind =
7288c9157d92SDimitry Andric TTI.enableScalableVectorization()
7289c9157d92SDimitry Andric ? TargetTransformInfo::RGK_ScalableVector
7290c9157d92SDimitry Andric : TargetTransformInfo::RGK_FixedWidthVector;
7291c9157d92SDimitry Andric
7292c9157d92SDimitry Andric TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7293c9157d92SDimitry Andric unsigned N = RegSize.getKnownMinValue() / WidestType;
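  // E.g. a 128-bit minimum register size with a widest loop type of 32 bits
  // gives an element count of 4 (scalable if the target enables scalable
  // vectorization).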
7294c9157d92SDimitry Andric return ElementCount::get(N, RegSize.isScalable());
72950b57cec5SDimitry Andric }
72960b57cec5SDimitry Andric
72970b57cec5SDimitry Andric VectorizationFactor
7298e8d8bef9SDimitry Andric LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7299e8d8bef9SDimitry Andric ElementCount VF = UserVF;
73000b57cec5SDimitry Andric // Outer loop handling: outer loops may require CFG and instruction-level
73010b57cec5SDimitry Andric // transformations before even evaluating whether vectorization is profitable.
73020b57cec5SDimitry Andric // Since we cannot modify the incoming IR, we need to build VPlan upfront in
73030b57cec5SDimitry Andric // the vectorization pipeline.
7304e8d8bef9SDimitry Andric if (!OrigLoop->isInnermost()) {
73050b57cec5SDimitry Andric // If the user doesn't provide a vectorization factor, determine a
73060b57cec5SDimitry Andric // reasonable one.
7307e8d8bef9SDimitry Andric if (UserVF.isZero()) {
7308c9157d92SDimitry Andric VF = determineVPlanVF(TTI, CM);
73090b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
73100b57cec5SDimitry Andric
73110b57cec5SDimitry Andric // Make sure we have a VF > 1 for stress testing.
7312e8d8bef9SDimitry Andric if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
73130b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
73140b57cec5SDimitry Andric << "overriding computed VF.\n");
7315e8d8bef9SDimitry Andric VF = ElementCount::getFixed(4);
73160b57cec5SDimitry Andric }
7317c9157d92SDimitry Andric } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7318c9157d92SDimitry Andric !ForceTargetSupportsScalableVectors) {
7319c9157d92SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7320c9157d92SDimitry Andric << "not supported by the target.\n");
7321c9157d92SDimitry Andric reportVectorizationFailure(
7322c9157d92SDimitry Andric "Scalable vectorization requested but not supported by the target",
7323c9157d92SDimitry Andric "the scalable user-specified vectorization width for outer-loop "
7324c9157d92SDimitry Andric "vectorization cannot be used because the target does not support "
7325c9157d92SDimitry Andric "scalable vectors.",
7326c9157d92SDimitry Andric "ScalableVFUnfeasible", ORE, OrigLoop);
7327c9157d92SDimitry Andric return VectorizationFactor::Disabled();
73280b57cec5SDimitry Andric }
73290b57cec5SDimitry Andric assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7330e8d8bef9SDimitry Andric assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7331e8d8bef9SDimitry Andric "VF needs to be a power of two");
7332e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7333e8d8bef9SDimitry Andric << "VF " << VF << " to build VPlans.\n");
73340b57cec5SDimitry Andric buildVPlans(VF, VF);
73350b57cec5SDimitry Andric
73360b57cec5SDimitry Andric // For VPlan build stress testing, we bail out after VPlan construction.
73370b57cec5SDimitry Andric if (VPlanBuildStressTest)
73380b57cec5SDimitry Andric return VectorizationFactor::Disabled();
73390b57cec5SDimitry Andric
734081ad6265SDimitry Andric return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
73410b57cec5SDimitry Andric }
73420b57cec5SDimitry Andric
73430b57cec5SDimitry Andric LLVM_DEBUG(
73440b57cec5SDimitry Andric dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
73450b57cec5SDimitry Andric "VPlan-native path.\n");
73460b57cec5SDimitry Andric return VectorizationFactor::Disabled();
73470b57cec5SDimitry Andric }
73480b57cec5SDimitry Andric
7349bdd1243dSDimitry Andric std::optional<VectorizationFactor>
7350e8d8bef9SDimitry Andric LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7351e8d8bef9SDimitry Andric assert(OrigLoop->isInnermost() && "Inner loop expected.");
7352fe013be4SDimitry Andric CM.collectValuesToIgnore();
7353fe013be4SDimitry Andric CM.collectElementTypesForWidening();
7354fe013be4SDimitry Andric
7355fe6060f1SDimitry Andric FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7356fe6060f1SDimitry Andric if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7357bdd1243dSDimitry Andric return std::nullopt;
73580b57cec5SDimitry Andric
73590b57cec5SDimitry Andric // Invalidate interleave groups if all blocks of the loop will be predicated.
7360349cc55cSDimitry Andric if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7361fe013be4SDimitry Andric !useMaskedInterleavedAccesses(TTI)) {
73620b57cec5SDimitry Andric LLVM_DEBUG(
73630b57cec5SDimitry Andric dbgs()
73640b57cec5SDimitry Andric << "LV: Invalidate all interleaved groups due to fold-tail by masking "
73650b57cec5SDimitry Andric "which requires masked-interleaved support.\n");
73665ffd83dbSDimitry Andric if (CM.InterleaveInfo.invalidateGroups())
73675ffd83dbSDimitry Andric // Invalidating interleave groups also requires invalidating all decisions
73685ffd83dbSDimitry Andric // based on them, which includes widening decisions and uniform and scalar
73695ffd83dbSDimitry Andric // values.
73705ffd83dbSDimitry Andric CM.invalidateCostModelingDecisions();
73710b57cec5SDimitry Andric }
73720b57cec5SDimitry Andric
7373fe6060f1SDimitry Andric ElementCount MaxUserVF =
7374fe6060f1SDimitry Andric UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7375fe6060f1SDimitry Andric bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7376fe6060f1SDimitry Andric if (!UserVF.isZero() && UserVFIsLegal) {
7377fe6060f1SDimitry Andric assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7378e8d8bef9SDimitry Andric "VF needs to be a power of two");
73790b57cec5SDimitry Andric // Collect the instructions (and their associated costs) that will be more
73800b57cec5SDimitry Andric // profitable to scalarize.
7381c9157d92SDimitry Andric CM.collectInLoopReductions();
7382fe6060f1SDimitry Andric if (CM.selectUserVectorizationFactor(UserVF)) {
7383fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7384fe6060f1SDimitry Andric buildVPlansWithVPRecipes(UserVF, UserVF);
7385fe013be4SDimitry Andric if (!hasPlanWithVF(UserVF)) {
7386fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7387fe013be4SDimitry Andric << ".\n");
7388fe013be4SDimitry Andric return std::nullopt;
7389fe013be4SDimitry Andric }
7390fe013be4SDimitry Andric
73910b57cec5SDimitry Andric LLVM_DEBUG(printPlans(dbgs()));
739281ad6265SDimitry Andric return {{UserVF, 0, 0}};
7393fe6060f1SDimitry Andric } else
7394fe6060f1SDimitry Andric reportVectorizationInfo("UserVF ignored because of invalid costs.",
7395fe6060f1SDimitry Andric "InvalidCost", ORE, OrigLoop);
73960b57cec5SDimitry Andric }
73970b57cec5SDimitry Andric
7398fe6060f1SDimitry Andric // Populate the set of Vectorization Factor Candidates.
7399fe6060f1SDimitry Andric ElementCountSet VFCandidates;
7400fe6060f1SDimitry Andric for (auto VF = ElementCount::getFixed(1);
7401fe6060f1SDimitry Andric ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7402fe6060f1SDimitry Andric VFCandidates.insert(VF);
7403fe6060f1SDimitry Andric for (auto VF = ElementCount::getScalable(1);
7404fe6060f1SDimitry Andric ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7405fe6060f1SDimitry Andric VFCandidates.insert(VF);
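  // For example, with MaxFactors.FixedVF = 8 and MaxFactors.ScalableVF = 4,
  // the candidate set is {1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4}.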
74060b57cec5SDimitry Andric
7407c9157d92SDimitry Andric CM.collectInLoopReductions();
7408fe6060f1SDimitry Andric for (const auto &VF : VFCandidates) {
74090b57cec5SDimitry Andric // Collect Uniform and Scalar instructions after vectorization with VF.
74100b57cec5SDimitry Andric CM.collectUniformsAndScalars(VF);
74110b57cec5SDimitry Andric
74120b57cec5SDimitry Andric // Collect the instructions (and their associated costs) that will be more
74130b57cec5SDimitry Andric // profitable to scalarize.
7414e8d8bef9SDimitry Andric if (VF.isVector())
74150b57cec5SDimitry Andric CM.collectInstsToScalarize(VF);
74160b57cec5SDimitry Andric }
74170b57cec5SDimitry Andric
7418fe6060f1SDimitry Andric buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7419fe6060f1SDimitry Andric buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7420e8d8bef9SDimitry Andric
74210b57cec5SDimitry Andric LLVM_DEBUG(printPlans(dbgs()));
7422fe6060f1SDimitry Andric if (!MaxFactors.hasVector())
74230b57cec5SDimitry Andric return VectorizationFactor::Disabled();
74240b57cec5SDimitry Andric
74250b57cec5SDimitry Andric // Select the optimal vectorization factor.
7426fe013be4SDimitry Andric VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7427753f127fSDimitry Andric assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7428fe013be4SDimitry Andric if (!hasPlanWithVF(VF.Width)) {
7429fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7430fe013be4SDimitry Andric << ".\n");
7431fe013be4SDimitry Andric return std::nullopt;
7432fe013be4SDimitry Andric }
7433753f127fSDimitry Andric return VF;
74340b57cec5SDimitry Andric }
74350b57cec5SDimitry Andric
7436349cc55cSDimitry Andric VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7437349cc55cSDimitry Andric assert(count_if(VPlans,
7438349cc55cSDimitry Andric [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7439349cc55cSDimitry Andric 1 &&
7440349cc55cSDimitry Andric "Best VF does not have a single VPlan.");
74410b57cec5SDimitry Andric
7442349cc55cSDimitry Andric for (const VPlanPtr &Plan : VPlans) {
7443349cc55cSDimitry Andric if (Plan->hasVF(VF))
7444349cc55cSDimitry Andric return *Plan.get();
7445349cc55cSDimitry Andric }
7446349cc55cSDimitry Andric llvm_unreachable("No plan found!");
74470b57cec5SDimitry Andric }
74480b57cec5SDimitry Andric
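// Attach "llvm.loop.unroll.runtime.disable" metadata to loop \p L, preserving
// any existing loop metadata, unless unroll-disable metadata is already
// present.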
744904eeddc0SDimitry Andric static void AddRuntimeUnrollDisableMetaData(Loop *L) {
745004eeddc0SDimitry Andric SmallVector<Metadata *, 4> MDs;
745104eeddc0SDimitry Andric // Reserve first location for self reference to the LoopID metadata node.
745204eeddc0SDimitry Andric MDs.push_back(nullptr);
745304eeddc0SDimitry Andric bool IsUnrollMetadata = false;
745404eeddc0SDimitry Andric MDNode *LoopID = L->getLoopID();
745504eeddc0SDimitry Andric if (LoopID) {
745604eeddc0SDimitry Andric // First find existing loop unrolling disable metadata.
745704eeddc0SDimitry Andric for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
745804eeddc0SDimitry Andric auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
745904eeddc0SDimitry Andric if (MD) {
746004eeddc0SDimitry Andric const auto *S = dyn_cast<MDString>(MD->getOperand(0));
746104eeddc0SDimitry Andric IsUnrollMetadata =
7462c9157d92SDimitry Andric S && S->getString().starts_with("llvm.loop.unroll.disable");
746304eeddc0SDimitry Andric }
746404eeddc0SDimitry Andric MDs.push_back(LoopID->getOperand(i));
746504eeddc0SDimitry Andric }
746604eeddc0SDimitry Andric }
746704eeddc0SDimitry Andric
746804eeddc0SDimitry Andric if (!IsUnrollMetadata) {
746904eeddc0SDimitry Andric // Add runtime unroll disable metadata.
747004eeddc0SDimitry Andric LLVMContext &Context = L->getHeader()->getContext();
747104eeddc0SDimitry Andric SmallVector<Metadata *, 1> DisableOperands;
747204eeddc0SDimitry Andric DisableOperands.push_back(
747304eeddc0SDimitry Andric MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
747404eeddc0SDimitry Andric MDNode *DisableNode = MDNode::get(Context, DisableOperands);
747504eeddc0SDimitry Andric MDs.push_back(DisableNode);
747604eeddc0SDimitry Andric MDNode *NewLoopID = MDNode::get(Context, MDs);
747704eeddc0SDimitry Andric // Set operand 0 to refer to the loop id itself.
747804eeddc0SDimitry Andric NewLoopID->replaceOperandWith(0, NewLoopID);
747904eeddc0SDimitry Andric L->setLoopID(NewLoopID);
748004eeddc0SDimitry Andric }
748104eeddc0SDimitry Andric }
748204eeddc0SDimitry Andric
7483cdc20ff6SDimitry Andric // Check if \p RedResult is a ComputeReductionResult instruction, and if it
7484cdc20ff6SDimitry Andric // is, create a merge phi node for it and add it to \p ReductionResumeValues.
7485cdc20ff6SDimitry Andric static void createAndCollectMergePhiForReduction(
7486cdc20ff6SDimitry Andric VPInstruction *RedResult,
7487cdc20ff6SDimitry Andric DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7488cdc20ff6SDimitry Andric VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7489cdc20ff6SDimitry Andric if (!RedResult ||
7490cdc20ff6SDimitry Andric RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7491cdc20ff6SDimitry Andric return;
7492cdc20ff6SDimitry Andric
7493cdc20ff6SDimitry Andric auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7494cdc20ff6SDimitry Andric const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7495cdc20ff6SDimitry Andric
7496cdc20ff6SDimitry Andric TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7497cdc20ff6SDimitry Andric Value *FinalValue =
7498cdc20ff6SDimitry Andric State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7499cdc20ff6SDimitry Andric auto *ResumePhi =
7500cdc20ff6SDimitry Andric dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7501cdc20ff6SDimitry Andric
7502cdc20ff6SDimitry Andric // TODO: bc.merge.rdx should not be created here; instead, it should be
7503cdc20ff6SDimitry Andric // modeled in VPlan.
7504cdc20ff6SDimitry Andric BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7505cdc20ff6SDimitry Andric // Create a phi node that merges control-flow from the backedge-taken check
7506cdc20ff6SDimitry Andric // block and the middle block.
7507cdc20ff6SDimitry Andric auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7508cdc20ff6SDimitry Andric LoopScalarPreHeader->getTerminator());
7509cdc20ff6SDimitry Andric
7510cdc20ff6SDimitry Andric // If we are fixing reductions in the epilogue loop then we should already
7511cdc20ff6SDimitry Andric // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7512cdc20ff6SDimitry Andric // we carry over the incoming values correctly.
7513cdc20ff6SDimitry Andric for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7514cdc20ff6SDimitry Andric if (Incoming == LoopMiddleBlock)
7515cdc20ff6SDimitry Andric BCBlockPhi->addIncoming(FinalValue, Incoming);
7516cdc20ff6SDimitry Andric else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7517cdc20ff6SDimitry Andric BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7518cdc20ff6SDimitry Andric Incoming);
7519cdc20ff6SDimitry Andric else
7520cdc20ff6SDimitry Andric BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7521cdc20ff6SDimitry Andric }
7522cdc20ff6SDimitry Andric
7523cdc20ff6SDimitry Andric auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7524cdc20ff6SDimitry Andric // TODO: This fixup should instead be modeled in VPlan.
7525cdc20ff6SDimitry Andric // Fix the scalar loop reduction variable with the incoming reduction sum
7526cdc20ff6SDimitry Andric // from the vector body and from the backedge value.
7527cdc20ff6SDimitry Andric int IncomingEdgeBlockIdx =
7528cdc20ff6SDimitry Andric OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7529cdc20ff6SDimitry Andric assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7530cdc20ff6SDimitry Andric // Pick the other block.
7531cdc20ff6SDimitry Andric int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7532cdc20ff6SDimitry Andric OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7533cdc20ff6SDimitry Andric Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7534cdc20ff6SDimitry Andric OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7535cdc20ff6SDimitry Andric
7536cdc20ff6SDimitry Andric ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7537cdc20ff6SDimitry Andric }
7538cdc20ff6SDimitry Andric
7539cdc20ff6SDimitry Andric std::pair<DenseMap<const SCEV *, Value *>,
7540cdc20ff6SDimitry Andric DenseMap<const RecurrenceDescriptor *, Value *>>
7541cdc20ff6SDimitry Andric LoopVectorizationPlanner::executePlan(
7542fe013be4SDimitry Andric ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7543fe013be4SDimitry Andric InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7544c9157d92SDimitry Andric const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7545bdd1243dSDimitry Andric assert(BestVPlan.hasVF(BestVF) &&
7546bdd1243dSDimitry Andric "Trying to execute plan with unsupported VF");
7547bdd1243dSDimitry Andric assert(BestVPlan.hasUF(BestUF) &&
7548bdd1243dSDimitry Andric "Trying to execute plan with unsupported UF");
7549fe013be4SDimitry Andric assert(
7550fe013be4SDimitry Andric (IsEpilogueVectorization || !ExpandedSCEVs) &&
7551fe013be4SDimitry Andric "expanded SCEVs to reuse can only be used during epilogue vectorization");
7552bdd1243dSDimitry Andric
7553349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7554349cc55cSDimitry Andric << '\n');
7555349cc55cSDimitry Andric
7556bdd1243dSDimitry Andric if (!IsEpilogueVectorization)
7557bdd1243dSDimitry Andric VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7558bdd1243dSDimitry Andric
75590b57cec5SDimitry Andric // Perform the actual loop transformation.
7560c9157d92SDimitry Andric VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7561c9157d92SDimitry Andric OrigLoop->getHeader()->getContext());
7562fe013be4SDimitry Andric
7563fe013be4SDimitry Andric // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7564fe013be4SDimitry Andric // before making any changes to the CFG.
7565fe013be4SDimitry Andric if (!BestVPlan.getPreheader()->empty()) {
7566fe013be4SDimitry Andric State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7567fe013be4SDimitry Andric State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7568fe013be4SDimitry Andric BestVPlan.getPreheader()->execute(&State);
7569fe013be4SDimitry Andric }
7570fe013be4SDimitry Andric if (!ILV.getTripCount())
7571fe013be4SDimitry Andric ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7572fe013be4SDimitry Andric else
7573fe013be4SDimitry Andric assert(IsEpilogueVectorization && "should only re-use the existing trip "
7574fe013be4SDimitry Andric "count during epilogue vectorization");
75750b57cec5SDimitry Andric
757681ad6265SDimitry Andric // 1. Set up the skeleton for vectorization, including vector pre-header and
757781ad6265SDimitry Andric // middle block. The vector loop is created during VPlan execution.
757804eeddc0SDimitry Andric Value *CanonicalIVStartValue;
757904eeddc0SDimitry Andric std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7580fe013be4SDimitry Andric ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7581fe013be4SDimitry Andric : State.ExpandedSCEVs);
758281ad6265SDimitry Andric
758381ad6265SDimitry Andric // Only use noalias metadata when using memory checks guaranteeing no overlap
758481ad6265SDimitry Andric // across all iterations.
758581ad6265SDimitry Andric const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7586fe013be4SDimitry Andric std::unique_ptr<LoopVersioning> LVer = nullptr;
758781ad6265SDimitry Andric if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
758881ad6265SDimitry Andric !LAI->getRuntimePointerChecking()->getDiffChecks()) {
758981ad6265SDimitry Andric
759081ad6265SDimitry Andric // We currently don't use LoopVersioning for the actual loop cloning but we
759181ad6265SDimitry Andric // still use it to add the noalias metadata.
759281ad6265SDimitry Andric // TODO: Find a better way to re-use LoopVersioning functionality to add
759381ad6265SDimitry Andric // metadata.
7594fe013be4SDimitry Andric LVer = std::make_unique<LoopVersioning>(
759581ad6265SDimitry Andric *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
759681ad6265SDimitry Andric PSE.getSE());
7597fe013be4SDimitry Andric State.LVer = &*LVer;
759881ad6265SDimitry Andric State.LVer->prepareNoAliasMetadata();
759981ad6265SDimitry Andric }
760081ad6265SDimitry Andric
76014824e7fdSDimitry Andric ILV.collectPoisonGeneratingRecipes(State);
76020b57cec5SDimitry Andric
7603e8d8bef9SDimitry Andric ILV.printDebugTracesAtStart();
7604e8d8bef9SDimitry Andric
76050b57cec5SDimitry Andric //===------------------------------------------------===//
76060b57cec5SDimitry Andric //
76070b57cec5SDimitry Andric // Notice: any optimization or new instruction that goes
76080b57cec5SDimitry Andric // into the code below should also be implemented in
76090b57cec5SDimitry Andric // the cost-model.
76100b57cec5SDimitry Andric //
76110b57cec5SDimitry Andric //===------------------------------------------------===//
76120b57cec5SDimitry Andric
76130b57cec5SDimitry Andric // 2. Copy and widen instructions from the old loop into the new loop.
7614c9157d92SDimitry Andric BestVPlan.prepareToExecute(ILV.getTripCount(),
7615c9157d92SDimitry Andric ILV.getOrCreateVectorTripCount(nullptr),
7616c9157d92SDimitry Andric CanonicalIVStartValue, State);
761781ad6265SDimitry Andric
7618349cc55cSDimitry Andric BestVPlan.execute(&State);
76190b57cec5SDimitry Andric
7620cdc20ff6SDimitry Andric // 2.5 Collect reduction resume values.
7621cdc20ff6SDimitry Andric DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7622cdc20ff6SDimitry Andric auto *ExitVPBB =
7623cdc20ff6SDimitry Andric cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7624cdc20ff6SDimitry Andric for (VPRecipeBase &R : *ExitVPBB) {
7625cdc20ff6SDimitry Andric createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7626cdc20ff6SDimitry Andric ReductionResumeValues, State, OrigLoop,
7627cdc20ff6SDimitry Andric State.CFG.VPBB2IRBB[ExitVPBB]);
7628cdc20ff6SDimitry Andric }
7629cdc20ff6SDimitry Andric
7630cdc20ff6SDimitry Andric // 2.6. Maintain Loop Hints
763104eeddc0SDimitry Andric // Keep all loop hints from the original loop on the vector loop (we'll
763204eeddc0SDimitry Andric // replace the vectorizer-specific hints below).
763304eeddc0SDimitry Andric MDNode *OrigLoopID = OrigLoop->getLoopID();
763404eeddc0SDimitry Andric
7635bdd1243dSDimitry Andric std::optional<MDNode *> VectorizedLoopID =
763604eeddc0SDimitry Andric makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
763704eeddc0SDimitry Andric LLVMLoopVectorizeFollowupVectorized});
763804eeddc0SDimitry Andric
763981ad6265SDimitry Andric VPBasicBlock *HeaderVPBB =
764081ad6265SDimitry Andric BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
764181ad6265SDimitry Andric Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
764281ad6265SDimitry Andric if (VectorizedLoopID)
7643bdd1243dSDimitry Andric L->setLoopID(*VectorizedLoopID);
764404eeddc0SDimitry Andric else {
764504eeddc0SDimitry Andric // Keep all loop hints from the original loop on the vector loop (we'll
764604eeddc0SDimitry Andric // replace the vectorizer-specific hints below).
764704eeddc0SDimitry Andric if (MDNode *LID = OrigLoop->getLoopID())
764804eeddc0SDimitry Andric L->setLoopID(LID);
764904eeddc0SDimitry Andric
765004eeddc0SDimitry Andric LoopVectorizeHints Hints(L, true, *ORE);
765104eeddc0SDimitry Andric Hints.setAlreadyVectorized();
765204eeddc0SDimitry Andric }
7653fe013be4SDimitry Andric TargetTransformInfo::UnrollingPreferences UP;
7654fe013be4SDimitry Andric TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7655fe013be4SDimitry Andric if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
765604eeddc0SDimitry Andric AddRuntimeUnrollDisableMetaData(L);
765704eeddc0SDimitry Andric
76580b57cec5SDimitry Andric // 3. Fix the vectorized code: take care of header phi's, live-outs,
76590b57cec5SDimitry Andric // predication, updating analyses.
766081ad6265SDimitry Andric ILV.fixVectorizedLoop(State, BestVPlan);
7661e8d8bef9SDimitry Andric
7662e8d8bef9SDimitry Andric ILV.printDebugTracesAtEnd();
7663fe013be4SDimitry Andric
7664cdc20ff6SDimitry Andric return {State.ExpandedSCEVs, ReductionResumeValues};
76650b57cec5SDimitry Andric }
76660b57cec5SDimitry Andric
7667fe6060f1SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7668fe6060f1SDimitry Andric void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7669fe6060f1SDimitry Andric for (const auto &Plan : VPlans)
7670fe6060f1SDimitry Andric if (PrintVPlansInDotFormat)
7671fe6060f1SDimitry Andric Plan->printDOT(O);
7672fe6060f1SDimitry Andric else
7673fe6060f1SDimitry Andric Plan->print(O);
7674fe6060f1SDimitry Andric }
7675fe6060f1SDimitry Andric #endif
7676fe6060f1SDimitry Andric
7677e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7678e8d8bef9SDimitry Andric // EpilogueVectorizerMainLoop
7679e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7680e8d8bef9SDimitry Andric
7681e8d8bef9SDimitry Andric /// This function is partially responsible for generating the control flow
7682e8d8bef9SDimitry Andric /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
768304eeddc0SDimitry Andric std::pair<BasicBlock *, Value *>
7684fe013be4SDimitry Andric EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7685fe013be4SDimitry Andric const SCEV2ValueTy &ExpandedSCEVs) {
768681ad6265SDimitry Andric createVectorLoopSkeleton("");
7687e8d8bef9SDimitry Andric
7688e8d8bef9SDimitry Andric // Generate the code to check the minimum iteration count of the vector
7689e8d8bef9SDimitry Andric // epilogue (see below).
7690e8d8bef9SDimitry Andric EPI.EpilogueIterationCountCheck =
769181ad6265SDimitry Andric emitIterationCountCheck(LoopScalarPreHeader, true);
7692e8d8bef9SDimitry Andric EPI.EpilogueIterationCountCheck->setName("iter.check");
7693e8d8bef9SDimitry Andric
7694e8d8bef9SDimitry Andric // Generate the code to check any assumptions that we've made for SCEV
7695e8d8bef9SDimitry Andric // expressions.
769681ad6265SDimitry Andric EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7697e8d8bef9SDimitry Andric
7698e8d8bef9SDimitry Andric // Generate the code that checks at runtime if arrays overlap. We put the
7699e8d8bef9SDimitry Andric // checks into a separate block to make the more common case of few elements
7700e8d8bef9SDimitry Andric // faster.
770181ad6265SDimitry Andric EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7702e8d8bef9SDimitry Andric
7703e8d8bef9SDimitry Andric // Generate the iteration count check for the main loop, *after* the check
7704e8d8bef9SDimitry Andric // for the epilogue loop, so that the path-length is shorter for the case
7705e8d8bef9SDimitry Andric // that goes directly through the vector epilogue. The longer-path length for
7706e8d8bef9SDimitry Andric // the main loop is compensated for, by the gain from vectorizing the larger
7707e8d8bef9SDimitry Andric // trip count. Note: the branch will get updated later on when we vectorize
7708e8d8bef9SDimitry Andric // the epilogue.
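// The resulting layout after this first pass is roughly:
//   iter.check                   - too few iterations even for the epilogue VF?
//   vector.scevcheck             - optional SCEV assumption checks
//   vector.memcheck              - optional runtime alias checks
//   vector.main.loop.iter.check  - too few iterations for the main VF * UF?
//   vector.ph                    - preheader of the main vector loop
// Block names and which of the optional checks appear depend on the loop.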
7709e8d8bef9SDimitry Andric EPI.MainLoopIterationCountCheck =
771081ad6265SDimitry Andric emitIterationCountCheck(LoopScalarPreHeader, false);
7711e8d8bef9SDimitry Andric
7712e8d8bef9SDimitry Andric // Generate the induction variable.
771381ad6265SDimitry Andric EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7714e8d8bef9SDimitry Andric
7715e8d8bef9SDimitry Andric // Skip induction resume value creation here because they will be created in
7716bdd1243dSDimitry Andric // the second pass for the scalar loop. The induction resume values for the
7717bdd1243dSDimitry Andric // inductions in the epilogue loop are created before executing the plan for
7718bdd1243dSDimitry Andric // the epilogue loop.
7719e8d8bef9SDimitry Andric
7720bdd1243dSDimitry Andric return {completeLoopSkeleton(), nullptr};
7721e8d8bef9SDimitry Andric }
7722e8d8bef9SDimitry Andric
7723e8d8bef9SDimitry Andric void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7724e8d8bef9SDimitry Andric LLVM_DEBUG({
7725e8d8bef9SDimitry Andric dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7726349cc55cSDimitry Andric << "Main Loop VF:" << EPI.MainLoopVF
7727e8d8bef9SDimitry Andric << ", Main Loop UF:" << EPI.MainLoopUF
7728349cc55cSDimitry Andric << ", Epilogue Loop VF:" << EPI.EpilogueVF
7729e8d8bef9SDimitry Andric << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7730e8d8bef9SDimitry Andric });
7731e8d8bef9SDimitry Andric }
7732e8d8bef9SDimitry Andric
7733e8d8bef9SDimitry Andric void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7734e8d8bef9SDimitry Andric DEBUG_WITH_TYPE(VerboseDebug, {
77354824e7fdSDimitry Andric dbgs() << "intermediate fn:\n"
77364824e7fdSDimitry Andric << *OrigLoop->getHeader()->getParent() << "\n";
7737e8d8bef9SDimitry Andric });
7738e8d8bef9SDimitry Andric }
7739e8d8bef9SDimitry Andric
774081ad6265SDimitry Andric BasicBlock *
774181ad6265SDimitry Andric EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
774281ad6265SDimitry Andric bool ForEpilogue) {
7743e8d8bef9SDimitry Andric assert(Bypass && "Expected valid bypass basic block.");
7744349cc55cSDimitry Andric ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7745e8d8bef9SDimitry Andric unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7746fe013be4SDimitry Andric Value *Count = getTripCount();
7747e8d8bef9SDimitry Andric // Reuse existing vector loop preheader for TC checks.
7748e8d8bef9SDimitry Andric // Note that new preheader block is generated for vector loop.
7749e8d8bef9SDimitry Andric BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7750e8d8bef9SDimitry Andric IRBuilder<> Builder(TCCheckBlock->getTerminator());
7751e8d8bef9SDimitry Andric
7752e8d8bef9SDimitry Andric // Generate code to check if the loop's trip count is less than VF * UF of
7753e8d8bef9SDimitry Andric // the main vector loop (or of the epilogue loop, when ForEpilogue is set).
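// E.g. for a fixed VF of 8 and UF of 2 this emits roughly
//   %min.iters.check = icmp ult i64 %trip.count, 16
// (ule rather than ult when a scalar epilogue is required, and a
// vscale-scaled step for scalable VFs).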
7754fe013be4SDimitry Andric auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7755fe013be4SDimitry Andric : VF.isVector())
7756fe013be4SDimitry Andric ? ICmpInst::ICMP_ULE
7757fe013be4SDimitry Andric : ICmpInst::ICMP_ULT;
7758e8d8bef9SDimitry Andric
7759e8d8bef9SDimitry Andric Value *CheckMinIters = Builder.CreateICmp(
7760349cc55cSDimitry Andric P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7761e8d8bef9SDimitry Andric "min.iters.check");
7762e8d8bef9SDimitry Andric
7763e8d8bef9SDimitry Andric if (!ForEpilogue)
7764e8d8bef9SDimitry Andric TCCheckBlock->setName("vector.main.loop.iter.check");
7765e8d8bef9SDimitry Andric
7766e8d8bef9SDimitry Andric // Create new preheader for vector loop.
7767e8d8bef9SDimitry Andric LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7768e8d8bef9SDimitry Andric DT, LI, nullptr, "vector.ph");
7769e8d8bef9SDimitry Andric
7770e8d8bef9SDimitry Andric if (ForEpilogue) {
7771e8d8bef9SDimitry Andric assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7772e8d8bef9SDimitry Andric DT->getNode(Bypass)->getIDom()) &&
7773e8d8bef9SDimitry Andric "TC check is expected to dominate Bypass");
7774e8d8bef9SDimitry Andric
7775e8d8bef9SDimitry Andric // Update dominator for Bypass & LoopExit.
7776e8d8bef9SDimitry Andric DT->changeImmediateDominator(Bypass, TCCheckBlock);
7777fe013be4SDimitry Andric if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7778fe6060f1SDimitry Andric // For loops with multiple exits, there's no edge from the middle block
7779fe6060f1SDimitry Andric // to exit blocks (as the epilogue must run) and thus no need to update
7780fe6060f1SDimitry Andric // the immediate dominator of the exit blocks.
7781e8d8bef9SDimitry Andric DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7782e8d8bef9SDimitry Andric
7783e8d8bef9SDimitry Andric LoopBypassBlocks.push_back(TCCheckBlock);
7784e8d8bef9SDimitry Andric
7785e8d8bef9SDimitry Andric // Save the trip count so we don't have to regenerate it in the
7786e8d8bef9SDimitry Andric // vec.epilog.iter.check. This is safe to do because the trip count
7787e8d8bef9SDimitry Andric // generated here dominates the vector epilog iter check.
7788e8d8bef9SDimitry Andric EPI.TripCount = Count;
7789e8d8bef9SDimitry Andric }
7790e8d8bef9SDimitry Andric
7791c9157d92SDimitry Andric BranchInst &BI =
7792c9157d92SDimitry Andric *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7793c9157d92SDimitry Andric if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7794c9157d92SDimitry Andric setBranchWeights(BI, MinItersBypassWeights);
7795c9157d92SDimitry Andric ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7796e8d8bef9SDimitry Andric
7797e8d8bef9SDimitry Andric return TCCheckBlock;
7798e8d8bef9SDimitry Andric }
7799e8d8bef9SDimitry Andric
7800e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7801e8d8bef9SDimitry Andric // EpilogueVectorizerEpilogueLoop
7802e8d8bef9SDimitry Andric //===--------------------------------------------------------------------===//
7803e8d8bef9SDimitry Andric
7804e8d8bef9SDimitry Andric /// This function is partially responsible for generating the control flow
7805e8d8bef9SDimitry Andric /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
780604eeddc0SDimitry Andric std::pair<BasicBlock *, Value *>
7807fe013be4SDimitry Andric EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7808fe013be4SDimitry Andric const SCEV2ValueTy &ExpandedSCEVs) {
780981ad6265SDimitry Andric createVectorLoopSkeleton("vec.epilog.");
7810e8d8bef9SDimitry Andric
7811e8d8bef9SDimitry Andric // Now, compare the remaining count and, if there aren't enough iterations to
7812e8d8bef9SDimitry Andric // execute the vectorized epilogue, skip to the scalar part.
7813e8d8bef9SDimitry Andric BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7814e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7815e8d8bef9SDimitry Andric LoopVectorPreHeader =
7816e8d8bef9SDimitry Andric SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7817e8d8bef9SDimitry Andric LI, nullptr, "vec.epilog.ph");
781881ad6265SDimitry Andric emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7819e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck);
7820e8d8bef9SDimitry Andric
7821e8d8bef9SDimitry Andric // Adjust the control flow taking the state info from the main loop
7822e8d8bef9SDimitry Andric // vectorization into account.
7823e8d8bef9SDimitry Andric assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7824e8d8bef9SDimitry Andric "expected this to be saved from the previous pass.");
7825e8d8bef9SDimitry Andric EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7826e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7827e8d8bef9SDimitry Andric
7828e8d8bef9SDimitry Andric DT->changeImmediateDominator(LoopVectorPreHeader,
7829e8d8bef9SDimitry Andric EPI.MainLoopIterationCountCheck);
7830e8d8bef9SDimitry Andric
7831e8d8bef9SDimitry Andric EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7832e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7833e8d8bef9SDimitry Andric
7834e8d8bef9SDimitry Andric if (EPI.SCEVSafetyCheck)
7835e8d8bef9SDimitry Andric EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7836e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7837e8d8bef9SDimitry Andric if (EPI.MemSafetyCheck)
7838e8d8bef9SDimitry Andric EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7839e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7840e8d8bef9SDimitry Andric
7841e8d8bef9SDimitry Andric DT->changeImmediateDominator(
7842e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck,
7843e8d8bef9SDimitry Andric VecEpilogueIterationCountCheck->getSinglePredecessor());
7844e8d8bef9SDimitry Andric
7845e8d8bef9SDimitry Andric DT->changeImmediateDominator(LoopScalarPreHeader,
7846e8d8bef9SDimitry Andric EPI.EpilogueIterationCountCheck);
7847fe013be4SDimitry Andric if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7848fe6060f1SDimitry Andric // If there is an epilogue which must run, there's no edge from the
7849fe6060f1SDimitry Andric // middle block to exit blocks and thus no need to update the immediate
7850fe6060f1SDimitry Andric // dominator of the exit blocks.
7851fe6060f1SDimitry Andric DT->changeImmediateDominator(LoopExitBlock,
7852fe6060f1SDimitry Andric EPI.EpilogueIterationCountCheck);
7853e8d8bef9SDimitry Andric
7854bdd1243dSDimitry Andric // Keep track of bypass blocks, as they feed start values to the induction and
7855bdd1243dSDimitry Andric // reduction phis in the scalar loop preheader.
7856e8d8bef9SDimitry Andric if (EPI.SCEVSafetyCheck)
7857e8d8bef9SDimitry Andric LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7858e8d8bef9SDimitry Andric if (EPI.MemSafetyCheck)
7859e8d8bef9SDimitry Andric LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7860e8d8bef9SDimitry Andric LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7861e8d8bef9SDimitry Andric
7862bdd1243dSDimitry Andric // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7863bdd1243dSDimitry Andric // reductions which merge control-flow from the latch block and the middle
7864bdd1243dSDimitry Andric // block. Update the incoming values here and move the Phi into the preheader.
786504eeddc0SDimitry Andric SmallVector<PHINode *, 4> PhisInBlock;
786604eeddc0SDimitry Andric for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
786704eeddc0SDimitry Andric PhisInBlock.push_back(&Phi);
786804eeddc0SDimitry Andric
786904eeddc0SDimitry Andric for (PHINode *Phi : PhisInBlock) {
7870bdd1243dSDimitry Andric Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
787104eeddc0SDimitry Andric Phi->replaceIncomingBlockWith(
787204eeddc0SDimitry Andric VecEpilogueIterationCountCheck->getSinglePredecessor(),
787304eeddc0SDimitry Andric VecEpilogueIterationCountCheck);
7874bdd1243dSDimitry Andric
7875bdd1243dSDimitry Andric // If the phi doesn't have an incoming value from the
7876bdd1243dSDimitry Andric // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7877bdd1243dSDimitry Andric // value and also those from other check blocks. This is needed for
7878bdd1243dSDimitry Andric // reduction phis only.
7879bdd1243dSDimitry Andric if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7880bdd1243dSDimitry Andric return EPI.EpilogueIterationCountCheck == IncB;
7881bdd1243dSDimitry Andric }))
7882bdd1243dSDimitry Andric continue;
788304eeddc0SDimitry Andric Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
788404eeddc0SDimitry Andric if (EPI.SCEVSafetyCheck)
788504eeddc0SDimitry Andric Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
788604eeddc0SDimitry Andric if (EPI.MemSafetyCheck)
788704eeddc0SDimitry Andric Phi->removeIncomingValue(EPI.MemSafetyCheck);
788804eeddc0SDimitry Andric }
788904eeddc0SDimitry Andric
7890e8d8bef9SDimitry Andric // Generate a resume induction for the vector epilogue and put it in the
7891e8d8bef9SDimitry Andric // vector epilogue preheader.
7892e8d8bef9SDimitry Andric Type *IdxTy = Legal->getWidestInductionType();
7893c9157d92SDimitry Andric PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7894c9157d92SDimitry Andric EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7895e8d8bef9SDimitry Andric EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7896e8d8bef9SDimitry Andric EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7897e8d8bef9SDimitry Andric EPI.MainLoopIterationCountCheck);
7898e8d8bef9SDimitry Andric
7899e8d8bef9SDimitry Andric // Generate induction resume values. These variables save the new starting
7900e8d8bef9SDimitry Andric // indexes for the scalar loop. They are used to test if there are any tail
7901e8d8bef9SDimitry Andric // iterations left once the vector loop has completed.
7902e8d8bef9SDimitry Andric // Note that when the vectorized epilogue is skipped due to its iteration
7903e8d8bef9SDimitry Andric // count check, the resume value for the induction variable comes from
7904e8d8bef9SDimitry Andric // the trip count of the main vector loop, hence passing the AdditionalBypass
7905e8d8bef9SDimitry Andric // argument.
7906fe013be4SDimitry Andric createInductionResumeValues(ExpandedSCEVs,
7907fe013be4SDimitry Andric {VecEpilogueIterationCountCheck,
7908e8d8bef9SDimitry Andric EPI.VectorTripCount} /* AdditionalBypass */);
7909e8d8bef9SDimitry Andric
7910bdd1243dSDimitry Andric return {completeLoopSkeleton(), EPResumeVal};
7911e8d8bef9SDimitry Andric }
7912e8d8bef9SDimitry Andric
7913e8d8bef9SDimitry Andric BasicBlock *
7914e8d8bef9SDimitry Andric EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
791581ad6265SDimitry Andric BasicBlock *Bypass, BasicBlock *Insert) {
7916e8d8bef9SDimitry Andric
7917e8d8bef9SDimitry Andric assert(EPI.TripCount &&
7918e8d8bef9SDimitry Andric "Expected trip count to have been saved in the first pass.");
7919e8d8bef9SDimitry Andric assert(
7920e8d8bef9SDimitry Andric (!isa<Instruction>(EPI.TripCount) ||
7921e8d8bef9SDimitry Andric DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7922e8d8bef9SDimitry Andric "saved trip count does not dominate insertion point.");
7923e8d8bef9SDimitry Andric Value *TC = EPI.TripCount;
7924e8d8bef9SDimitry Andric IRBuilder<> Builder(Insert->getTerminator());
7925e8d8bef9SDimitry Andric Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7926e8d8bef9SDimitry Andric
7927e8d8bef9SDimitry Andric // Generate code to check if the loop's trip count is less than VF * UF of the
7928e8d8bef9SDimitry Andric // vector epilogue loop.
7929fe013be4SDimitry Andric auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7930fe013be4SDimitry Andric ? ICmpInst::ICMP_ULE
7931fe013be4SDimitry Andric : ICmpInst::ICMP_ULT;
7932e8d8bef9SDimitry Andric
7933349cc55cSDimitry Andric Value *CheckMinIters =
7934349cc55cSDimitry Andric Builder.CreateICmp(P, Count,
7935349cc55cSDimitry Andric createStepForVF(Builder, Count->getType(),
7936349cc55cSDimitry Andric EPI.EpilogueVF, EPI.EpilogueUF),
7937e8d8bef9SDimitry Andric "min.epilog.iters.check");
7938e8d8bef9SDimitry Andric
7939c9157d92SDimitry Andric BranchInst &BI =
7940c9157d92SDimitry Andric *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7941c9157d92SDimitry Andric if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7942c9157d92SDimitry Andric unsigned MainLoopStep = UF * VF.getKnownMinValue();
7943c9157d92SDimitry Andric unsigned EpilogueLoopStep =
7944c9157d92SDimitry Andric EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7945c9157d92SDimitry Andric // We assume the remaining `Count` is equally distributed in
7946c9157d92SDimitry Andric // [0, MainLoopStep)
7947c9157d92SDimitry Andric // So the probability for `Count < EpilogueLoopStep` should be
7948c9157d92SDimitry Andric // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
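// E.g. MainLoopStep = 16 and EpilogueLoopStep = 4 gives EstimatedSkipCount
// = 4, i.e. weights {4, 12}: roughly a 25% chance that the remaining
// iterations skip the epilogue vector loop.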
7949c9157d92SDimitry Andric unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7950c9157d92SDimitry Andric const uint32_t Weights[] = {EstimatedSkipCount,
7951c9157d92SDimitry Andric MainLoopStep - EstimatedSkipCount};
7952c9157d92SDimitry Andric setBranchWeights(BI, Weights);
7953c9157d92SDimitry Andric }
7954c9157d92SDimitry Andric ReplaceInstWithInst(Insert->getTerminator(), &BI);
7955e8d8bef9SDimitry Andric
7956e8d8bef9SDimitry Andric LoopBypassBlocks.push_back(Insert);
7957e8d8bef9SDimitry Andric return Insert;
7958e8d8bef9SDimitry Andric }
7959e8d8bef9SDimitry Andric
7960e8d8bef9SDimitry Andric void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7961e8d8bef9SDimitry Andric LLVM_DEBUG({
7962e8d8bef9SDimitry Andric dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7963349cc55cSDimitry Andric << "Epilogue Loop VF:" << EPI.EpilogueVF
7964e8d8bef9SDimitry Andric << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7965e8d8bef9SDimitry Andric });
7966e8d8bef9SDimitry Andric }
7967e8d8bef9SDimitry Andric
7968e8d8bef9SDimitry Andric void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7969e8d8bef9SDimitry Andric DEBUG_WITH_TYPE(VerboseDebug, {
79704824e7fdSDimitry Andric dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7971e8d8bef9SDimitry Andric });
7972e8d8bef9SDimitry Andric }
7973e8d8bef9SDimitry Andric
79740b57cec5SDimitry Andric bool LoopVectorizationPlanner::getDecisionAndClampRange(
7975e8d8bef9SDimitry Andric const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7976e8d8bef9SDimitry Andric assert(!Range.isEmpty() && "Trying to test an empty VF range.");
79770b57cec5SDimitry Andric bool PredicateAtRangeStart = Predicate(Range.Start);
79780b57cec5SDimitry Andric
7979fe013be4SDimitry Andric for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
79800b57cec5SDimitry Andric if (Predicate(TmpVF) != PredicateAtRangeStart) {
79810b57cec5SDimitry Andric Range.End = TmpVF;
79820b57cec5SDimitry Andric break;
79830b57cec5SDimitry Andric }
79840b57cec5SDimitry Andric
79850b57cec5SDimitry Andric return PredicateAtRangeStart;
79860b57cec5SDimitry Andric }
79870b57cec5SDimitry Andric
79880b57cec5SDimitry Andric /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
79890b57cec5SDimitry Andric /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
79900b57cec5SDimitry Andric /// of VF's starting at a given VF and extending it as much as possible. Each
79910b57cec5SDimitry Andric /// vectorization decision can potentially shorten this sub-range during
79920b57cec5SDimitry Andric /// buildVPlan().
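/// For example, with MinVF = 4 and MaxVF = 16 this may first build a VPlan
/// for the sub-range [4, 8) and then a second one for [8, 32), depending on
/// how the individual decisions clamp each sub-range.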
7993e8d8bef9SDimitry Andric void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7994e8d8bef9SDimitry Andric ElementCount MaxVF) {
7995fe013be4SDimitry Andric auto MaxVFTimes2 = MaxVF * 2;
7996fe013be4SDimitry Andric for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7997fe013be4SDimitry Andric VFRange SubRange = {VF, MaxVFTimes2};
79980b57cec5SDimitry Andric VPlans.push_back(buildVPlan(SubRange));
79990b57cec5SDimitry Andric VF = SubRange.End;
80000b57cec5SDimitry Andric }
80010b57cec5SDimitry Andric }
80020b57cec5SDimitry Andric
80030b57cec5SDimitry Andric VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8004fe013be4SDimitry Andric VPlan &Plan) {
80050b57cec5SDimitry Andric assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
80060b57cec5SDimitry Andric
80070b57cec5SDimitry Andric // Look for cached value.
80080b57cec5SDimitry Andric std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
80090b57cec5SDimitry Andric EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
80100b57cec5SDimitry Andric if (ECEntryIt != EdgeMaskCache.end())
80110b57cec5SDimitry Andric return ECEntryIt->second;
80120b57cec5SDimitry Andric
8013cdc20ff6SDimitry Andric VPValue *SrcMask = getBlockInMask(Src);
80140b57cec5SDimitry Andric
80150b57cec5SDimitry Andric // The terminator has to be a branch inst!
80160b57cec5SDimitry Andric BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
80170b57cec5SDimitry Andric assert(BI && "Unexpected terminator found");
80180b57cec5SDimitry Andric
801913138422SDimitry Andric if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
80200b57cec5SDimitry Andric return EdgeMaskCache[Edge] = SrcMask;
80210b57cec5SDimitry Andric
8022e8d8bef9SDimitry Andric // If source is an exiting block, we know the exit edge is dynamically dead
8023e8d8bef9SDimitry Andric // in the vector loop, and thus we don't need to restrict the mask. Avoid
8024e8d8bef9SDimitry Andric // adding uses of an otherwise potentially dead instruction.
8025e8d8bef9SDimitry Andric if (OrigLoop->isLoopExiting(Src))
8026e8d8bef9SDimitry Andric return EdgeMaskCache[Edge] = SrcMask;
8027e8d8bef9SDimitry Andric
8028fe013be4SDimitry Andric VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
80290b57cec5SDimitry Andric assert(EdgeMask && "No Edge Mask found for condition");
80300b57cec5SDimitry Andric
80310b57cec5SDimitry Andric if (BI->getSuccessor(0) != Dst)
80320eae32dcSDimitry Andric EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
80330b57cec5SDimitry Andric
8034d409305fSDimitry Andric if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8035d409305fSDimitry Andric // The condition is 'SrcMask && EdgeMask', which is equivalent to
8036d409305fSDimitry Andric // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8037d409305fSDimitry Andric // The select version does not introduce new UB if SrcMask is false and
8038d409305fSDimitry Andric // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8039fe013be4SDimitry Andric VPValue *False = Plan.getVPValueOrAddLiveIn(
8040d409305fSDimitry Andric ConstantInt::getFalse(BI->getCondition()->getType()));
80410eae32dcSDimitry Andric EdgeMask =
80420eae32dcSDimitry Andric Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8043d409305fSDimitry Andric }
80440b57cec5SDimitry Andric
80450b57cec5SDimitry Andric return EdgeMaskCache[Edge] = EdgeMask;
80460b57cec5SDimitry Andric }
80470b57cec5SDimitry Andric
8048c9157d92SDimitry Andric void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
8049c9157d92SDimitry Andric BasicBlock *Header = OrigLoop->getHeader();
80500b57cec5SDimitry Andric
8051c9157d92SDimitry Andric // When not folding the tail, use nullptr to model all-true mask.
8052c9157d92SDimitry Andric if (!CM.foldTailByMasking()) {
8053c9157d92SDimitry Andric BlockMaskCache[Header] = nullptr;
8054c9157d92SDimitry Andric return;
8055c9157d92SDimitry Andric }
8056753f127fSDimitry Andric
80570eae32dcSDimitry Andric // Introduce the early-exit compare IV <= BTC to form header block mask.
805804eeddc0SDimitry Andric // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
805904eeddc0SDimitry Andric // constructing the desired canonical IV in the header block as its first
806004eeddc0SDimitry Andric // non-phi instructions.
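// Conceptually, for a loop with trip count %n the resulting mask is
//   icmp ule <widened canonical IV>, <splat of backedge-taken count (%n - 1)>
// evaluated per lane; the IV type and VF depend on the loop.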
8061753f127fSDimitry Andric
8062fe013be4SDimitry Andric VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
806304eeddc0SDimitry Andric auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8064fe013be4SDimitry Andric auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8065c9157d92SDimitry Andric HeaderVPBB->insert(IV, NewInsertionPoint);
80660eae32dcSDimitry Andric
8067e8d8bef9SDimitry Andric VPBuilder::InsertPointGuard Guard(Builder);
806804eeddc0SDimitry Andric Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8069c9157d92SDimitry Andric VPValue *BlockMask = nullptr;
8070fe013be4SDimitry Andric VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8071c9157d92SDimitry Andric BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8072c9157d92SDimitry Andric BlockMaskCache[Header] = BlockMask;
80730b57cec5SDimitry Andric }
80740b57cec5SDimitry Andric
8075cdc20ff6SDimitry Andric VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8076cdc20ff6SDimitry Andric // Return the cached value.
8077cdc20ff6SDimitry Andric BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8078cdc20ff6SDimitry Andric assert(BCEntryIt != BlockMaskCache.end() &&
8079cdc20ff6SDimitry Andric "Trying to access mask for block without one.");
8080c9157d92SDimitry Andric return BCEntryIt->second;
8081cdc20ff6SDimitry Andric }
8082c9157d92SDimitry Andric
8083cdc20ff6SDimitry Andric void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8084cdc20ff6SDimitry Andric assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8085cdc20ff6SDimitry Andric assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8086c9157d92SDimitry Andric assert(OrigLoop->getHeader() != BB &&
8087c9157d92SDimitry Andric "Loop header must have cached block mask");
8088c9157d92SDimitry Andric
8089c9157d92SDimitry Andric // All-one mask is modelled as no-mask following the convention for masked
8090c9157d92SDimitry Andric // load/store/gather/scatter. Initialize BlockMask to no-mask.
8091c9157d92SDimitry Andric VPValue *BlockMask = nullptr;
80920b57cec5SDimitry Andric // This is the block mask. We OR all incoming edges.
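// E.g. for a block with two predecessors P0 and P1 the mask is roughly
// or(edge-mask(P0, BB), edge-mask(P1, BB)); if either edge mask is all-one
// the block mask becomes all-one (modelled as nullptr) as well.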
80930b57cec5SDimitry Andric for (auto *Predecessor : predecessors(BB)) {
80940b57cec5SDimitry Andric VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8095cdc20ff6SDimitry Andric if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8096cdc20ff6SDimitry Andric BlockMaskCache[BB] = EdgeMask;
80976c20abcdSDimitry Andric return;
8098cdc20ff6SDimitry Andric }
80990b57cec5SDimitry Andric
81000b57cec5SDimitry Andric if (!BlockMask) { // BlockMask has its initialized nullptr value.
81010b57cec5SDimitry Andric BlockMask = EdgeMask;
81020b57cec5SDimitry Andric continue;
81030b57cec5SDimitry Andric }
81040b57cec5SDimitry Andric
81050eae32dcSDimitry Andric BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
81060b57cec5SDimitry Andric }
81070b57cec5SDimitry Andric
8108cdc20ff6SDimitry Andric BlockMaskCache[BB] = BlockMask;
81090b57cec5SDimitry Andric }
81100b57cec5SDimitry Andric
8111fe6060f1SDimitry Andric VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8112fe6060f1SDimitry Andric ArrayRef<VPValue *> Operands,
8113fe6060f1SDimitry Andric VFRange &Range,
81140b57cec5SDimitry Andric VPlanPtr &Plan) {
81155ffd83dbSDimitry Andric assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
81165ffd83dbSDimitry Andric "Must be called with either a load or store");
81170b57cec5SDimitry Andric
8118e8d8bef9SDimitry Andric auto willWiden = [&](ElementCount VF) -> bool {
81190b57cec5SDimitry Andric LoopVectorizationCostModel::InstWidening Decision =
81200b57cec5SDimitry Andric CM.getWideningDecision(I, VF);
81210b57cec5SDimitry Andric assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
81220b57cec5SDimitry Andric "CM decision should be taken at this point.");
8123480093f4SDimitry Andric if (Decision == LoopVectorizationCostModel::CM_Interleave)
8124480093f4SDimitry Andric return true;
8125480093f4SDimitry Andric if (CM.isScalarAfterVectorization(I, VF) ||
8126480093f4SDimitry Andric CM.isProfitableToScalarize(I, VF))
8127480093f4SDimitry Andric return false;
81280b57cec5SDimitry Andric return Decision != LoopVectorizationCostModel::CM_Scalarize;
81290b57cec5SDimitry Andric };
81300b57cec5SDimitry Andric
81310b57cec5SDimitry Andric if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
81320b57cec5SDimitry Andric return nullptr;
81330b57cec5SDimitry Andric
81340b57cec5SDimitry Andric VPValue *Mask = nullptr;
81350b57cec5SDimitry Andric if (Legal->isMaskRequired(I))
8136cdc20ff6SDimitry Andric Mask = getBlockInMask(I->getParent());
81370b57cec5SDimitry Andric
8138349cc55cSDimitry Andric // Determine if the pointer operand of the access is either consecutive or
8139349cc55cSDimitry Andric // reverse consecutive.
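// E.g. an access to A[i] inside the loop is consecutive and an access to
// A[N - i] is reverse consecutive; other patterns are widened here as
// gathers/scatters if that was the cost-model decision.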
8140349cc55cSDimitry Andric LoopVectorizationCostModel::InstWidening Decision =
8141349cc55cSDimitry Andric CM.getWideningDecision(I, Range.Start);
8142349cc55cSDimitry Andric bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8143349cc55cSDimitry Andric bool Consecutive =
8144349cc55cSDimitry Andric Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8145349cc55cSDimitry Andric
8146de8261c4SDimitry Andric VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8147de8261c4SDimitry Andric if (Consecutive) {
8148cdc20ff6SDimitry Andric auto *GEP = dyn_cast<GetElementPtrInst>(
8149cdc20ff6SDimitry Andric Ptr->getUnderlyingValue()->stripPointerCasts());
8150cdc20ff6SDimitry Andric auto *VectorPtr = new VPVectorPointerRecipe(
8151cdc20ff6SDimitry Andric Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8152cdc20ff6SDimitry Andric I->getDebugLoc());
8153de8261c4SDimitry Andric Builder.getInsertBlock()->appendRecipe(VectorPtr);
8154de8261c4SDimitry Andric Ptr = VectorPtr;
8155de8261c4SDimitry Andric }
81565ffd83dbSDimitry Andric if (LoadInst *Load = dyn_cast<LoadInst>(I))
8157de8261c4SDimitry Andric return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8158de8261c4SDimitry Andric Reverse);
81595ffd83dbSDimitry Andric
81605ffd83dbSDimitry Andric StoreInst *Store = cast<StoreInst>(I);
8161de8261c4SDimitry Andric return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8162de8261c4SDimitry Andric Consecutive, Reverse);
81630b57cec5SDimitry Andric }
81640b57cec5SDimitry Andric
816581ad6265SDimitry Andric /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
816681ad6265SDimitry Andric /// insert a recipe to expand the step for the induction recipe.
8167fe013be4SDimitry Andric static VPWidenIntOrFpInductionRecipe *
8168fe013be4SDimitry Andric createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8169fe013be4SDimitry Andric VPValue *Start, const InductionDescriptor &IndDesc,
8170fe013be4SDimitry Andric VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8171fe013be4SDimitry Andric VFRange &Range) {
81721fd87a68SDimitry Andric assert(IndDesc.getStartValue() ==
81731fd87a68SDimitry Andric Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
817481ad6265SDimitry Andric assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
817581ad6265SDimitry Andric "step must be loop invariant");
817681ad6265SDimitry Andric
817781ad6265SDimitry Andric VPValue *Step =
817881ad6265SDimitry Andric vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
81791fd87a68SDimitry Andric if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8180fe013be4SDimitry Andric return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
81811fd87a68SDimitry Andric }
81821fd87a68SDimitry Andric assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8183fe013be4SDimitry Andric return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
81841fd87a68SDimitry Andric }
81851fd87a68SDimitry Andric
818681ad6265SDimitry Andric VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
818781ad6265SDimitry Andric PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
81881fd87a68SDimitry Andric
81890b57cec5SDimitry Andric // Check if this is an integer or fp induction. If so, build the recipe that
81900b57cec5SDimitry Andric // produces its scalar and vector values.
81911fd87a68SDimitry Andric if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8192fe013be4SDimitry Andric return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
819381ad6265SDimitry Andric *PSE.getSE(), *OrigLoop, Range);
81940b57cec5SDimitry Andric
819581ad6265SDimitry Andric // Check if this is pointer induction. If so, build the recipe for it.
81966246ae0bSDimitry Andric if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8197bdd1243dSDimitry Andric VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8198bdd1243dSDimitry Andric *PSE.getSE());
81996246ae0bSDimitry Andric return new VPWidenPointerInductionRecipe(
8200bdd1243dSDimitry Andric Phi, Operands[0], Step, *II,
82016246ae0bSDimitry Andric LoopVectorizationPlanner::getDecisionAndClampRange(
82026246ae0bSDimitry Andric [&](ElementCount VF) {
82036246ae0bSDimitry Andric return CM.isScalarAfterVectorization(Phi, VF);
82046246ae0bSDimitry Andric },
82056246ae0bSDimitry Andric Range));
82066246ae0bSDimitry Andric }
82070b57cec5SDimitry Andric return nullptr;
82080b57cec5SDimitry Andric }
82090b57cec5SDimitry Andric
8210fe6060f1SDimitry Andric VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
821181ad6265SDimitry Andric TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
82120b57cec5SDimitry Andric // Optimize the special case where the source is a constant integer
82130b57cec5SDimitry Andric // induction variable. Notice that we can only optimize the 'trunc' case
82140b57cec5SDimitry Andric // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
82150b57cec5SDimitry Andric // (c) other casts depend on pointer size.
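// E.g. a "%t = trunc i64 %iv to i32" of a known induction %iv can be widened
// directly as an i32 induction instead of widening the i64 induction and
// truncating each element.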
82160b57cec5SDimitry Andric
82170b57cec5SDimitry Andric // Determine whether \p K is a truncation based on an induction variable that
82180b57cec5SDimitry Andric // can be optimized.
82190b57cec5SDimitry Andric auto isOptimizableIVTruncate =
8220e8d8bef9SDimitry Andric [&](Instruction *K) -> std::function<bool(ElementCount)> {
8221e8d8bef9SDimitry Andric return [=](ElementCount VF) -> bool {
8222e8d8bef9SDimitry Andric return CM.isOptimizableIVTruncate(K, VF);
8223e8d8bef9SDimitry Andric };
82240b57cec5SDimitry Andric };
82250b57cec5SDimitry Andric
82265ffd83dbSDimitry Andric if (LoopVectorizationPlanner::getDecisionAndClampRange(
8227e8d8bef9SDimitry Andric isOptimizableIVTruncate(I), Range)) {
8228e8d8bef9SDimitry Andric
82290eae32dcSDimitry Andric auto *Phi = cast<PHINode>(I->getOperand(0));
82300eae32dcSDimitry Andric const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8231fe013be4SDimitry Andric VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8232fe013be4SDimitry Andric return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8233fe013be4SDimitry Andric *OrigLoop, Range);
8234e8d8bef9SDimitry Andric }
82350b57cec5SDimitry Andric return nullptr;
82360b57cec5SDimitry Andric }
82370b57cec5SDimitry Andric
8238fe6060f1SDimitry Andric VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8239fe6060f1SDimitry Andric ArrayRef<VPValue *> Operands,
8240fe6060f1SDimitry Andric VPlanPtr &Plan) {
8241fe6060f1SDimitry Andric // If all incoming values are equal, the incoming VPValue can be used directly
8242fe6060f1SDimitry Andric // instead of creating a new VPBlendRecipe.
8243bdd1243dSDimitry Andric if (llvm::all_equal(Operands))
8244fe6060f1SDimitry Andric return Operands[0];
8245fe6060f1SDimitry Andric
824681ad6265SDimitry Andric unsigned NumIncoming = Phi->getNumIncomingValues();
824781ad6265SDimitry Andric // For in-loop reductions, we do not need to create an additional select.
824881ad6265SDimitry Andric VPValue *InLoopVal = nullptr;
824981ad6265SDimitry Andric for (unsigned In = 0; In < NumIncoming; In++) {
825081ad6265SDimitry Andric PHINode *PhiOp =
825181ad6265SDimitry Andric dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
825281ad6265SDimitry Andric if (PhiOp && CM.isInLoopReduction(PhiOp)) {
825381ad6265SDimitry Andric assert(!InLoopVal && "Found more than one in-loop reduction!");
825481ad6265SDimitry Andric InLoopVal = Operands[In];
825581ad6265SDimitry Andric }
825681ad6265SDimitry Andric }
825781ad6265SDimitry Andric
825881ad6265SDimitry Andric assert((!InLoopVal || NumIncoming == 2) &&
825981ad6265SDimitry Andric "Found an in-loop reduction for PHI with unexpected number of "
826081ad6265SDimitry Andric "incoming values");
826181ad6265SDimitry Andric if (InLoopVal)
826281ad6265SDimitry Andric return Operands[Operands[0] == InLoopVal ? 1 : 0];
826381ad6265SDimitry Andric
82640b57cec5SDimitry Andric // We know that all PHIs in non-header blocks are converted into selects, so
82650b57cec5SDimitry Andric // we don't have to worry about the insertion order and we can just use the
82660b57cec5SDimitry Andric // builder. At this point we generate the predication tree. There may be
82670b57cec5SDimitry Andric // duplications since this is a simple recursive scan, but future
82680b57cec5SDimitry Andric // optimizations will clean it up.
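// Conceptually, a phi merging %a from the if.then block and %b from the
// if.else block becomes select(edge-mask(if.then, merge), %a, %b) once the
// blend recipe is lowered.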
8269fe6060f1SDimitry Andric SmallVector<VPValue *, 2> OperandsWithMask;
8270fe6060f1SDimitry Andric
82710b57cec5SDimitry Andric for (unsigned In = 0; In < NumIncoming; In++) {
82720b57cec5SDimitry Andric VPValue *EdgeMask =
8273fe013be4SDimitry Andric createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
82740b57cec5SDimitry Andric assert((EdgeMask || NumIncoming == 1) &&
82750b57cec5SDimitry Andric "Multiple predecessors with one having a full mask");
8276fe6060f1SDimitry Andric OperandsWithMask.push_back(Operands[In]);
82770b57cec5SDimitry Andric if (EdgeMask)
8278fe6060f1SDimitry Andric OperandsWithMask.push_back(EdgeMask);
82790b57cec5SDimitry Andric }
8280fe6060f1SDimitry Andric return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
82810b57cec5SDimitry Andric }
82820b57cec5SDimitry Andric
8283fe6060f1SDimitry Andric VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8284fe6060f1SDimitry Andric ArrayRef<VPValue *> Operands,
8285fe013be4SDimitry Andric VFRange &Range,
8286fe013be4SDimitry Andric VPlanPtr &Plan) {
82870b57cec5SDimitry Andric bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
828804eeddc0SDimitry Andric [this, CI](ElementCount VF) {
828904eeddc0SDimitry Andric return CM.isScalarWithPredication(CI, VF);
829004eeddc0SDimitry Andric },
82915ffd83dbSDimitry Andric Range);
82920b57cec5SDimitry Andric
82930b57cec5SDimitry Andric if (IsPredicated)
82945ffd83dbSDimitry Andric return nullptr;
82950b57cec5SDimitry Andric
82965ffd83dbSDimitry Andric Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
82975ffd83dbSDimitry Andric if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8298e8d8bef9SDimitry Andric ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8299e8d8bef9SDimitry Andric ID == Intrinsic::pseudoprobe ||
8300e8d8bef9SDimitry Andric ID == Intrinsic::experimental_noalias_scope_decl))
83015ffd83dbSDimitry Andric return nullptr;
83025ffd83dbSDimitry Andric
8303fe013be4SDimitry Andric SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8304bdd1243dSDimitry Andric
8305bdd1243dSDimitry Andric // Is it beneficial to perform intrinsic call compared to lib call?
8306bdd1243dSDimitry Andric bool ShouldUseVectorIntrinsic =
8307bdd1243dSDimitry Andric ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8308bdd1243dSDimitry Andric [&](ElementCount VF) -> bool {
8309c9157d92SDimitry Andric return CM.getCallWideningDecision(CI, VF).Kind ==
8310c9157d92SDimitry Andric LoopVectorizationCostModel::CM_IntrinsicCall;
8311bdd1243dSDimitry Andric },
8312bdd1243dSDimitry Andric Range);
8313bdd1243dSDimitry Andric if (ShouldUseVectorIntrinsic)
8314a58f00eaSDimitry Andric return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8315a58f00eaSDimitry Andric CI->getDebugLoc());
8316bdd1243dSDimitry Andric
8317fe013be4SDimitry Andric Function *Variant = nullptr;
8318c9157d92SDimitry Andric std::optional<unsigned> MaskPos;
8319bdd1243dSDimitry Andric // Is it better to call a vectorized version of the function than to
8320bdd1243dSDimitry Andric // scalarize the call?
8321bdd1243dSDimitry Andric auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8322bdd1243dSDimitry Andric [&](ElementCount VF) -> bool {
8323bdd1243dSDimitry Andric // The following case may be scalarized depending on the VF.
8324bdd1243dSDimitry Andric // The flag shows whether we can use a usual call for the vectorized
8325bdd1243dSDimitry Andric // version of the instruction.
8326fe013be4SDimitry Andric
8327fe013be4SDimitry Andric // If we've found a variant at a previous VF, then stop looking. A
8328fe013be4SDimitry Andric // vectorized variant of a function expects input in a certain shape
8329fe013be4SDimitry Andric // -- basically the number of input registers, the number of lanes
8330fe013be4SDimitry Andric // per register, and whether there's a mask required.
8331fe013be4SDimitry Andric // We store a pointer to the variant in the VPWidenCallRecipe, so
8332fe013be4SDimitry Andric // once we have an appropriate variant it's only valid for that VF.
8333fe013be4SDimitry Andric // This will force a different vplan to be generated for each VF that
8334fe013be4SDimitry Andric // finds a valid variant.
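// E.g. a variant vectorized for 4 lanes and no mask cannot be reused at
// VF=8 or at a masked call site; with VFABI-style mangling, something like
// _ZGVnN4v_foo would only be usable when vectorizing by 4.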
8335fe013be4SDimitry Andric if (Variant)
8336fe013be4SDimitry Andric return false;
8337c9157d92SDimitry Andric LoopVectorizationCostModel::CallWideningDecision Decision =
8338c9157d92SDimitry Andric CM.getCallWideningDecision(CI, VF);
8339c9157d92SDimitry Andric if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8340c9157d92SDimitry Andric Variant = Decision.Variant;
8341c9157d92SDimitry Andric MaskPos = Decision.MaskPos;
8342c9157d92SDimitry Andric return true;
8343c9157d92SDimitry Andric }
8344c9157d92SDimitry Andric
8345c9157d92SDimitry Andric return false;
8346bdd1243dSDimitry Andric },
8347bdd1243dSDimitry Andric Range);
8348fe013be4SDimitry Andric if (ShouldUseVectorCall) {
8349c9157d92SDimitry Andric if (MaskPos.has_value()) {
8350fe013be4SDimitry Andric // We have 2 cases that would require a mask:
8351fe013be4SDimitry Andric // 1) The block needs to be predicated, either due to a conditional
8352fe013be4SDimitry Andric // in the scalar loop or use of an active lane mask with
8353fe013be4SDimitry Andric // tail-folding, and we use the appropriate mask for the block.
8354fe013be4SDimitry Andric // 2) No mask is required for the block, but the only available
8355fe013be4SDimitry Andric // vector variant at this VF requires a mask, so we synthesize an
8356fe013be4SDimitry Andric // all-true mask.
8357fe013be4SDimitry Andric VPValue *Mask = nullptr;
8358fe013be4SDimitry Andric if (Legal->isMaskRequired(CI))
8359cdc20ff6SDimitry Andric Mask = getBlockInMask(CI->getParent());
8360fe013be4SDimitry Andric else
8361fe013be4SDimitry Andric Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8362fe013be4SDimitry Andric IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8363fe013be4SDimitry Andric
8364c9157d92SDimitry Andric Ops.insert(Ops.begin() + *MaskPos, Mask);
8365fe013be4SDimitry Andric }
8366fe013be4SDimitry Andric
8367bdd1243dSDimitry Andric return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8368a58f00eaSDimitry Andric Intrinsic::not_intrinsic, CI->getDebugLoc(),
8369a58f00eaSDimitry Andric Variant);
8370fe013be4SDimitry Andric }
8371bdd1243dSDimitry Andric
8372bdd1243dSDimitry Andric return nullptr;
83735ffd83dbSDimitry Andric }
83745ffd83dbSDimitry Andric
83755ffd83dbSDimitry Andric bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
83765ffd83dbSDimitry Andric assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
83775ffd83dbSDimitry Andric !isa<StoreInst>(I) && "Instruction should have been handled earlier");
83785ffd83dbSDimitry Andric // Instruction should be widened, unless it is scalar after vectorization,
83795ffd83dbSDimitry Andric // scalarization is profitable or it is predicated.
8380e8d8bef9SDimitry Andric auto WillScalarize = [this, I](ElementCount VF) -> bool {
83815ffd83dbSDimitry Andric return CM.isScalarAfterVectorization(I, VF) ||
838204eeddc0SDimitry Andric CM.isProfitableToScalarize(I, VF) ||
838304eeddc0SDimitry Andric CM.isScalarWithPredication(I, VF);
83845ffd83dbSDimitry Andric };
83855ffd83dbSDimitry Andric return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
83865ffd83dbSDimitry Andric Range);
83875ffd83dbSDimitry Andric }
83885ffd83dbSDimitry Andric
8389bdd1243dSDimitry Andric VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8390bdd1243dSDimitry Andric ArrayRef<VPValue *> Operands,
8391bdd1243dSDimitry Andric VPBasicBlock *VPBB, VPlanPtr &Plan) {
8392bdd1243dSDimitry Andric switch (I->getOpcode()) {
8393bdd1243dSDimitry Andric default:
8394bdd1243dSDimitry Andric return nullptr;
8395bdd1243dSDimitry Andric case Instruction::SDiv:
8396bdd1243dSDimitry Andric case Instruction::UDiv:
8397bdd1243dSDimitry Andric case Instruction::SRem:
8398bdd1243dSDimitry Andric case Instruction::URem: {
8399bdd1243dSDimitry Andric // If not provably safe, use a select to form a safe divisor before widening the
8400bdd1243dSDimitry Andric // div/rem operation itself. Otherwise fall through to general handling below.
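// E.g. for a predicated udiv this emits, conceptually,
//   %safe.rhs = select i1 %block.mask, i32 %rhs, i32 1
//   %result   = udiv i32 %lhs, %safe.rhs
// so that masked-off lanes divide by 1 rather than a possibly-zero divisor.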
8401bdd1243dSDimitry Andric if (CM.isPredicatedInst(I)) {
8402bdd1243dSDimitry Andric SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8403cdc20ff6SDimitry Andric VPValue *Mask = getBlockInMask(I->getParent());
8404fe013be4SDimitry Andric VPValue *One = Plan->getVPValueOrAddLiveIn(
8405fe013be4SDimitry Andric ConstantInt::get(I->getType(), 1u, false));
8406bdd1243dSDimitry Andric auto *SafeRHS =
8407bdd1243dSDimitry Andric new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8408bdd1243dSDimitry Andric I->getDebugLoc());
8409bdd1243dSDimitry Andric VPBB->appendRecipe(SafeRHS);
8410bdd1243dSDimitry Andric Ops[1] = SafeRHS;
8411bdd1243dSDimitry Andric return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8412bdd1243dSDimitry Andric }
8413fe013be4SDimitry Andric [[fallthrough]];
8414bdd1243dSDimitry Andric }
84150b57cec5SDimitry Andric case Instruction::Add:
84160b57cec5SDimitry Andric case Instruction::And:
84170b57cec5SDimitry Andric case Instruction::AShr:
84180b57cec5SDimitry Andric case Instruction::FAdd:
84190b57cec5SDimitry Andric case Instruction::FCmp:
84200b57cec5SDimitry Andric case Instruction::FDiv:
84210b57cec5SDimitry Andric case Instruction::FMul:
84220b57cec5SDimitry Andric case Instruction::FNeg:
84230b57cec5SDimitry Andric case Instruction::FRem:
84240b57cec5SDimitry Andric case Instruction::FSub:
84250b57cec5SDimitry Andric case Instruction::ICmp:
84260b57cec5SDimitry Andric case Instruction::LShr:
84270b57cec5SDimitry Andric case Instruction::Mul:
84280b57cec5SDimitry Andric case Instruction::Or:
84290b57cec5SDimitry Andric case Instruction::Select:
84300b57cec5SDimitry Andric case Instruction::Shl:
84310b57cec5SDimitry Andric case Instruction::Sub:
84320b57cec5SDimitry Andric case Instruction::Xor:
843381ad6265SDimitry Andric case Instruction::Freeze:
8434fe6060f1SDimitry Andric return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8435bdd1243dSDimitry Andric };
8436fe6060f1SDimitry Andric }
8437fe6060f1SDimitry Andric
8438fe6060f1SDimitry Andric void VPRecipeBuilder::fixHeaderPhis() {
8439fe6060f1SDimitry Andric BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
844004eeddc0SDimitry Andric for (VPHeaderPHIRecipe *R : PhisToFix) {
8441fe6060f1SDimitry Andric auto *PN = cast<PHINode>(R->getUnderlyingValue());
8442fe6060f1SDimitry Andric VPRecipeBase *IncR =
8443fe6060f1SDimitry Andric getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8444fe6060f1SDimitry Andric R->addOperand(IncR->getVPSingleValue());
8445fe6060f1SDimitry Andric }
84460b57cec5SDimitry Andric }
84470b57cec5SDimitry Andric
8448fe013be4SDimitry Andric VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8449fe013be4SDimitry Andric VFRange &Range,
8450fe013be4SDimitry Andric VPlan &Plan) {
84510b57cec5SDimitry Andric bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8452e8d8bef9SDimitry Andric [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
84530b57cec5SDimitry Andric Range);
84540b57cec5SDimitry Andric
8455bdd1243dSDimitry Andric bool IsPredicated = CM.isPredicatedInst(I);
84560b57cec5SDimitry Andric
84576e75b2fbSDimitry Andric // Even if the instruction is not marked as uniform, there are certain
84586e75b2fbSDimitry Andric // intrinsic calls that can be effectively treated as such, so we check for
84596e75b2fbSDimitry Andric // them here. Conservatively, we only do this for scalable vectors, since
84606e75b2fbSDimitry Andric // for fixed-width VFs we can always fall back on full scalarization.
84616e75b2fbSDimitry Andric if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
84626e75b2fbSDimitry Andric switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
84636e75b2fbSDimitry Andric case Intrinsic::assume:
84646e75b2fbSDimitry Andric case Intrinsic::lifetime_start:
84656e75b2fbSDimitry Andric case Intrinsic::lifetime_end:
84666e75b2fbSDimitry Andric // For scalable vectors if one of the operands is variant then we still
84676e75b2fbSDimitry Andric // want to mark as uniform, which will generate one instruction for just
84686e75b2fbSDimitry Andric // the first lane of the vector. We can't scalarize the call in the same
84696e75b2fbSDimitry Andric // way as for fixed-width vectors because we don't know how many lanes
84706e75b2fbSDimitry Andric // there are.
84716e75b2fbSDimitry Andric //
84726e75b2fbSDimitry Andric // The reasons for doing it this way for scalable vectors are:
84736e75b2fbSDimitry Andric // 1. For the assume intrinsic generating the instruction for the first
84746e75b2fbSDimitry Andric // lane is still better than not generating any at all. For
84756e75b2fbSDimitry Andric // example, the input may be a splat across all lanes.
84766e75b2fbSDimitry Andric // 2. For the lifetime start/end intrinsics the pointer operand only
84776e75b2fbSDimitry Andric // does anything useful when the input comes from a stack object,
84786e75b2fbSDimitry Andric // which suggests it should always be uniform. For non-stack objects
84796e75b2fbSDimitry Andric // the effect is to poison the object, which still allows us to
84806e75b2fbSDimitry Andric // remove the call.
84816e75b2fbSDimitry Andric IsUniform = true;
84826e75b2fbSDimitry Andric break;
84836e75b2fbSDimitry Andric default:
84846e75b2fbSDimitry Andric break;
84856e75b2fbSDimitry Andric }
84866e75b2fbSDimitry Andric }
8487fe013be4SDimitry Andric VPValue *BlockInMask = nullptr;
84880b57cec5SDimitry Andric if (!IsPredicated) {
8489fe013be4SDimitry Andric // Finalize the recipe for Instr first, if it is not predicated.
84900b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8491fcaf7f86SDimitry Andric } else {
8492fe013be4SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8493fe013be4SDimitry Andric // Instructions marked for predication are replicated and a mask operand is
8494fe013be4SDimitry Andric // added initially. Masked replicate recipes will later be placed under an
8495fe013be4SDimitry Andric // if-then construct to prevent side-effects. Generate recipes to compute
8496fe013be4SDimitry Andric // the block mask for this region.
8497cdc20ff6SDimitry Andric BlockInMask = getBlockInMask(I->getParent());
8498fe6060f1SDimitry Andric }
8499fcaf7f86SDimitry Andric
8500fe013be4SDimitry Andric auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8501fe013be4SDimitry Andric IsUniform, BlockInMask);
8502fe013be4SDimitry Andric return toVPRecipeResult(Recipe);
85030b57cec5SDimitry Andric }
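// For intuition only, a rough sketch (all names invented) of what a
// predicated replicate recipe eventually lowers to for one lane of a
// fixed-width VF:
//
//   %m0 = extractelement <4 x i1> %block.mask, i32 0
//   br i1 %m0, label %pred.sdiv.if, label %pred.sdiv.continue
// pred.sdiv.if:
//   %d0 = sdiv i32 %a0, %b0
//   br label %pred.sdiv.continue
// pred.sdiv.continue:
//   %r0 = phi i32 [ poison, %prev.bb ], [ %d0, %pred.sdiv.if ]
//
// so a lane whose mask bit is false never executes the side-effecting or
// potentially trapping instruction.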
85040b57cec5SDimitry Andric
8505fe6060f1SDimitry Andric VPRecipeOrVPValueTy
8506fe6060f1SDimitry Andric VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8507fe6060f1SDimitry Andric ArrayRef<VPValue *> Operands,
8508bdd1243dSDimitry Andric VFRange &Range, VPBasicBlock *VPBB,
8509bdd1243dSDimitry Andric VPlanPtr &Plan) {
851081ad6265SDimitry Andric // First, check for specific widening recipes that deal with inductions, Phi
851181ad6265SDimitry Andric // nodes, calls and memory operations.
85125ffd83dbSDimitry Andric VPRecipeBase *Recipe;
85135ffd83dbSDimitry Andric if (auto Phi = dyn_cast<PHINode>(Instr)) {
85145ffd83dbSDimitry Andric if (Phi->getParent() != OrigLoop->getHeader())
8515fe6060f1SDimitry Andric return tryToBlend(Phi, Operands, Plan);
8516bdd1243dSDimitry Andric
8517bdd1243dSDimitry Andric // Always record recipes for header phis. Later first-order recurrence phis
8518bdd1243dSDimitry Andric // can have earlier phis as incoming values.
8519bdd1243dSDimitry Andric recordRecipeOf(Phi);
8520bdd1243dSDimitry Andric
852181ad6265SDimitry Andric if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8522fe6060f1SDimitry Andric return toVPRecipeResult(Recipe);
8523e8d8bef9SDimitry Andric
852404eeddc0SDimitry Andric VPHeaderPHIRecipe *PhiRecipe = nullptr;
852581ad6265SDimitry Andric assert((Legal->isReductionVariable(Phi) ||
8526bdd1243dSDimitry Andric Legal->isFixedOrderRecurrence(Phi)) &&
8527bdd1243dSDimitry Andric "can only widen reductions and fixed-order recurrences here");
8528fe6060f1SDimitry Andric VPValue *StartV = Operands[0];
8529e8d8bef9SDimitry Andric if (Legal->isReductionVariable(Phi)) {
85300eae32dcSDimitry Andric const RecurrenceDescriptor &RdxDesc =
85310eae32dcSDimitry Andric Legal->getReductionVars().find(Phi)->second;
8532fe6060f1SDimitry Andric assert(RdxDesc.getRecurrenceStartValue() ==
8533fe6060f1SDimitry Andric Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8534fe6060f1SDimitry Andric PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8535fe6060f1SDimitry Andric CM.isInLoopReduction(Phi),
8536fe6060f1SDimitry Andric CM.useOrderedReductions(RdxDesc));
8537fe6060f1SDimitry Andric } else {
8538bdd1243dSDimitry Andric // TODO: Currently fixed-order recurrences are modeled as chains of
8539bdd1243dSDimitry Andric // first-order recurrences. If there are no users of the intermediate
8540bdd1243dSDimitry Andric // recurrences in the chain, the fixed order recurrence should be modeled
8541bdd1243dSDimitry Andric // directly, enabling more efficient codegen.
8542fe6060f1SDimitry Andric PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8543e8d8bef9SDimitry Andric }
8544e8d8bef9SDimitry Andric
8545fe6060f1SDimitry Andric // Record the recipe of the incoming value from the backedge, so the backedge
8546fe6060f1SDimitry Andric // value can be added to the phi after all recipes have been created.
8547bdd1243dSDimitry Andric auto *Inc = cast<Instruction>(
8548bdd1243dSDimitry Andric Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8549bdd1243dSDimitry Andric auto RecipeIter = Ingredient2Recipe.find(Inc);
8550bdd1243dSDimitry Andric if (RecipeIter == Ingredient2Recipe.end())
8551bdd1243dSDimitry Andric recordRecipeOf(Inc);
8552bdd1243dSDimitry Andric
8553fe6060f1SDimitry Andric PhisToFix.push_back(PhiRecipe);
8554fe6060f1SDimitry Andric return toVPRecipeResult(PhiRecipe);
8555fe6060f1SDimitry Andric }
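// Illustrative example of the fixed-order recurrence case handled above:
// for a loop computing
//
//   for (i) { out[i] = in[i] + prev; prev = in[i]; }
//
// the header phi for 'prev' becomes a VPFirstOrderRecurrencePHIRecipe, and a
// later transform splices the last element of the previous iteration's
// vector of in[i] with the current one to reconstruct 'prev' for every lane.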
8556fe6060f1SDimitry Andric
8557fe6060f1SDimitry Andric if (isa<TruncInst>(Instr) &&
8558fe6060f1SDimitry Andric (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8559fe6060f1SDimitry Andric Range, *Plan)))
8560fe6060f1SDimitry Andric return toVPRecipeResult(Recipe);
85615ffd83dbSDimitry Andric
856281ad6265SDimitry Andric // All widen recipes below deal only with VF > 1.
856381ad6265SDimitry Andric if (LoopVectorizationPlanner::getDecisionAndClampRange(
856481ad6265SDimitry Andric [&](ElementCount VF) { return VF.isScalar(); }, Range))
856581ad6265SDimitry Andric return nullptr;
856681ad6265SDimitry Andric
856781ad6265SDimitry Andric if (auto *CI = dyn_cast<CallInst>(Instr))
8568fe013be4SDimitry Andric return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
856981ad6265SDimitry Andric
857081ad6265SDimitry Andric if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
857181ad6265SDimitry Andric return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
857281ad6265SDimitry Andric
85735ffd83dbSDimitry Andric if (!shouldWiden(Instr, Range))
85745ffd83dbSDimitry Andric return nullptr;
85755ffd83dbSDimitry Andric
85765ffd83dbSDimitry Andric if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8577fe6060f1SDimitry Andric return toVPRecipeResult(new VPWidenGEPRecipe(
8578fe013be4SDimitry Andric GEP, make_range(Operands.begin(), Operands.end())));
85795ffd83dbSDimitry Andric
85805ffd83dbSDimitry Andric if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8581fe6060f1SDimitry Andric return toVPRecipeResult(new VPWidenSelectRecipe(
8582fe013be4SDimitry Andric *SI, make_range(Operands.begin(), Operands.end())));
8583fe013be4SDimitry Andric }
8584fe013be4SDimitry Andric
8585fe013be4SDimitry Andric if (auto *CI = dyn_cast<CastInst>(Instr)) {
8586c9157d92SDimitry Andric return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8587c9157d92SDimitry Andric CI->getType(), *CI));
85880b57cec5SDimitry Andric }
85890b57cec5SDimitry Andric
8590bdd1243dSDimitry Andric return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
85910b57cec5SDimitry Andric }
85920b57cec5SDimitry Andric
8593e8d8bef9SDimitry Andric void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8594e8d8bef9SDimitry Andric ElementCount MaxVF) {
8595e8d8bef9SDimitry Andric assert(OrigLoop->isInnermost() && "Inner loop expected.");
85960b57cec5SDimitry Andric
8597fe013be4SDimitry Andric auto MaxVFTimes2 = MaxVF * 2;
8598fe013be4SDimitry Andric for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8599fe013be4SDimitry Andric VFRange SubRange = {VF, MaxVFTimes2};
8600c9157d92SDimitry Andric if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8601c9157d92SDimitry Andric // Now optimize the initial VPlan.
8602c9157d92SDimitry Andric if (!Plan->hasVF(ElementCount::getFixed(1)))
8603c9157d92SDimitry Andric VPlanTransforms::truncateToMinimalBitwidths(
8604c9157d92SDimitry Andric *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8605c9157d92SDimitry Andric VPlanTransforms::optimize(*Plan, *PSE.getSE());
8606c9157d92SDimitry Andric assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8607c9157d92SDimitry Andric VPlans.push_back(std::move(Plan));
8608c9157d92SDimitry Andric }
86090b57cec5SDimitry Andric VF = SubRange.End;
86100b57cec5SDimitry Andric }
86110b57cec5SDimitry Andric }
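// Worked example with hypothetical values: for MinVF = 4 and MaxVF = 16,
// MaxVFTimes2 is 32. The first iteration tries to build a plan for the
// half-open range [4, 32); if some widening decision changes at VF = 16,
// tryToBuildVPlanWithVPRecipes clamps the range to [4, 16) and the resulting
// plan covers VFs 4 and 8. The next iteration then starts at VF = 16 with
// range [16, 32), and so on until the range start reaches MaxVFTimes2.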
86120b57cec5SDimitry Andric
8613753f127fSDimitry Andric // Add the canonical IV and branch recipes required to control the
8614753f127fSDimitry Andric // loop.
8615c9157d92SDimitry Andric static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8616c9157d92SDimitry Andric DebugLoc DL) {
861704eeddc0SDimitry Andric Value *StartIdx = ConstantInt::get(IdxTy, 0);
8618fe013be4SDimitry Andric auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
861904eeddc0SDimitry Andric
8620753f127fSDimitry Andric // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
862104eeddc0SDimitry Andric auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
862204eeddc0SDimitry Andric VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
862304eeddc0SDimitry Andric VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
862404eeddc0SDimitry Andric Header->insert(CanonicalIVPHI, Header->begin());
862504eeddc0SDimitry Andric
8626753f127fSDimitry Andric // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8627753f127fSDimitry Andric // IV by VF * UF.
862804eeddc0SDimitry Andric auto *CanonicalIVIncrement =
8629c9157d92SDimitry Andric new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8630c9157d92SDimitry Andric {HasNUW, false}, DL, "index.next");
863104eeddc0SDimitry Andric CanonicalIVPHI->addOperand(CanonicalIVIncrement);
863204eeddc0SDimitry Andric
863381ad6265SDimitry Andric VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8634fe013be4SDimitry Andric EB->appendRecipe(CanonicalIVIncrement);
8635fe013be4SDimitry Andric
8636753f127fSDimitry Andric // Add the BranchOnCount VPInstruction to the latch.
8637c9157d92SDimitry Andric VPInstruction *BranchBack =
8638c9157d92SDimitry Andric new VPInstruction(VPInstruction::BranchOnCount,
863904eeddc0SDimitry Andric {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8640753f127fSDimitry Andric EB->appendRecipe(BranchBack);
8641753f127fSDimitry Andric }
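// Roughly, the recipes added above show up in a VPlan dump along these lines
// (value names are illustrative and not the exact printer output):
//
//   vector.body:
//     EMIT vp<%iv> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
//     ...
//   vector.latch:
//     EMIT vp<%index.next> = add nuw vp<%iv>, vp<%vf.x.uf>
//     EMIT branch-on-count vp<%index.next>, vp<%vector.trip.count>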
864204eeddc0SDimitry Andric
864381ad6265SDimitry Andric // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
864481ad6265SDimitry Andric // original exit block.
8645c9157d92SDimitry Andric static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
864681ad6265SDimitry Andric VPlan &Plan) {
864781ad6265SDimitry Andric BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
864881ad6265SDimitry Andric BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
864981ad6265SDimitry Andric // Only handle single-exit loops with unique exit blocks for now.
865081ad6265SDimitry Andric if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
865181ad6265SDimitry Andric return;
865281ad6265SDimitry Andric
865381ad6265SDimitry Andric // Introduce VPUsers modeling the exit values.
865481ad6265SDimitry Andric for (PHINode &ExitPhi : ExitBB->phis()) {
865581ad6265SDimitry Andric Value *IncomingValue =
865681ad6265SDimitry Andric ExitPhi.getIncomingValueForBlock(ExitingBB);
8657fe013be4SDimitry Andric VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
865881ad6265SDimitry Andric Plan.addLiveOut(&ExitPhi, V);
865981ad6265SDimitry Andric }
866081ad6265SDimitry Andric }
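// Sketch of the input this handles (names invented): given an LCSSA phi in
// the unique exit block such as
//
//   exit:
//     %sum.lcssa = phi i32 [ %sum.next, %loop.latch ]
//
// a live-out is recorded that maps %sum.lcssa to the VPValue modeling
// %sum.next, so the exit phi can later be given the value produced by the
// vector loop.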
866181ad6265SDimitry Andric
8662c9157d92SDimitry Andric VPlanPtr
8663c9157d92SDimitry Andric LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
86640b57cec5SDimitry Andric
8665480093f4SDimitry Andric SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8666480093f4SDimitry Andric
86675ffd83dbSDimitry Andric VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8668480093f4SDimitry Andric
8669480093f4SDimitry Andric // ---------------------------------------------------------------------------
8670480093f4SDimitry Andric // Pre-construction: record ingredients whose recipes we'll need to further
8671480093f4SDimitry Andric // process after constructing the initial VPlan.
8672480093f4SDimitry Andric // ---------------------------------------------------------------------------
8673480093f4SDimitry Andric
8674480093f4SDimitry Andric // For each interleave group which is relevant for this (possibly trimmed)
8675480093f4SDimitry Andric // Range, add it to the set of groups to be later applied to the VPlan and add
8676480093f4SDimitry Andric // placeholders for its members' Recipes which we'll be replacing with a
8677480093f4SDimitry Andric // single VPInterleaveRecipe.
8678480093f4SDimitry Andric for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8679e8d8bef9SDimitry Andric auto applyIG = [IG, this](ElementCount VF) -> bool {
8680fe013be4SDimitry Andric bool Result = (VF.isVector() && // Query is illegal for VF == 1
8681480093f4SDimitry Andric CM.getWideningDecision(IG->getInsertPos(), VF) ==
8682480093f4SDimitry Andric LoopVectorizationCostModel::CM_Interleave);
8683fe013be4SDimitry Andric // For scalable vectors, the only interleave factor currently supported
8684fe013be4SDimitry Andric // is 2 since we require the (de)interleave2 intrinsics instead of
8685fe013be4SDimitry Andric // shufflevectors.
8686fe013be4SDimitry Andric assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8687fe013be4SDimitry Andric "Unsupported interleave factor for scalable vectors");
8688fe013be4SDimitry Andric return Result;
8689480093f4SDimitry Andric };
8690480093f4SDimitry Andric if (!getDecisionAndClampRange(applyIG, Range))
8691480093f4SDimitry Andric continue;
8692480093f4SDimitry Andric InterleaveGroups.insert(IG);
8693480093f4SDimitry Andric for (unsigned i = 0; i < IG->getFactor(); i++)
8694480093f4SDimitry Andric if (Instruction *Member = IG->getMember(i))
8695480093f4SDimitry Andric RecipeBuilder.recordRecipeOf(Member);
8696480093f4SDimitry Andric }
8697480093f4SDimitry Andric
8698480093f4SDimitry Andric // ---------------------------------------------------------------------------
8699480093f4SDimitry Andric // Build initial VPlan: Scan the body of the loop in a topological order to
8700480093f4SDimitry Andric // visit each basic block after having visited its predecessor basic blocks.
8701480093f4SDimitry Andric // ---------------------------------------------------------------------------
87020b57cec5SDimitry Andric
8703fe013be4SDimitry Andric // Create initial VPlan skeleton, having a basic block for the pre-header
8704fe013be4SDimitry Andric // which contains SCEV expansions that need to happen before the CFG is
8705fe013be4SDimitry Andric // modified; a basic block for the vector pre-header, followed by a region for
8706fe013be4SDimitry Andric // the vector loop, followed by the middle basic block. The skeleton vector
8707fe013be4SDimitry Andric // loop region contains a header and a latch basic block.
8708fe013be4SDimitry Andric VPlanPtr Plan = VPlan::createInitialVPlan(
8709fe013be4SDimitry Andric createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8710fe013be4SDimitry Andric *PSE.getSE());
871181ad6265SDimitry Andric VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
87120eae32dcSDimitry Andric VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
87130eae32dcSDimitry Andric VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8714c9157d92SDimitry Andric Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8715c9157d92SDimitry Andric Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
87160b57cec5SDimitry Andric
8717fe013be4SDimitry Andric // Don't use getDecisionAndClampRange here, because we don't know the UF,
8718fe013be4SDimitry Andric // so it is better for this function to be conservative rather than to
8719fe013be4SDimitry Andric // split the range up into different VPlans.
8720c9157d92SDimitry Andric // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8721fe013be4SDimitry Andric bool IVUpdateMayOverflow = false;
8722fe013be4SDimitry Andric for (ElementCount VF : Range)
8723fe013be4SDimitry Andric IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8724fe013be4SDimitry Andric
8725c9157d92SDimitry Andric DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8726c9157d92SDimitry Andric TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8727c9157d92SDimitry Andric // When not folding the tail, we know that the induction increment will not
8728c9157d92SDimitry Andric // overflow.
8729c9157d92SDimitry Andric bool HasNUW = Style == TailFoldingStyle::None;
8730c9157d92SDimitry Andric addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8731c9157d92SDimitry Andric
87320b57cec5SDimitry Andric // Scan the body of the loop in a topological order to visit each basic block
87330b57cec5SDimitry Andric // after having visited its predecessor basic blocks.
87340b57cec5SDimitry Andric LoopBlocksDFS DFS(OrigLoop);
87350b57cec5SDimitry Andric DFS.perform(LI);
87360b57cec5SDimitry Andric
87370eae32dcSDimitry Andric VPBasicBlock *VPBB = HeaderVPBB;
8738cdc20ff6SDimitry Andric bool NeedsMasks = CM.foldTailByMasking() ||
8739cdc20ff6SDimitry Andric any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
8740cdc20ff6SDimitry Andric return Legal->blockNeedsPredication(BB);
8741cdc20ff6SDimitry Andric });
87420b57cec5SDimitry Andric for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
87430b57cec5SDimitry Andric // Relevant instructions from basic block BB will be grouped into VPRecipe
87440b57cec5SDimitry Andric // ingredients and fill a new VPBasicBlock.
874581ad6265SDimitry Andric if (VPBB != HeaderVPBB)
87460eae32dcSDimitry Andric VPBB->setName(BB->getName());
87470b57cec5SDimitry Andric Builder.setInsertPoint(VPBB);
87480b57cec5SDimitry Andric
8749cdc20ff6SDimitry Andric if (VPBB == HeaderVPBB)
8750cdc20ff6SDimitry Andric RecipeBuilder.createHeaderMask(*Plan);
8751cdc20ff6SDimitry Andric else if (NeedsMasks)
8752cdc20ff6SDimitry Andric RecipeBuilder.createBlockInMask(BB, *Plan);
8753cdc20ff6SDimitry Andric
8754480093f4SDimitry Andric // Introduce each ingredient into VPlan.
875581ad6265SDimitry Andric // TODO: Model and preserve debug intrinsics in VPlan.
8756c9157d92SDimitry Andric for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
87570b57cec5SDimitry Andric Instruction *Instr = &I;
8758fe6060f1SDimitry Andric SmallVector<VPValue *, 4> Operands;
8759fe6060f1SDimitry Andric auto *Phi = dyn_cast<PHINode>(Instr);
8760fe6060f1SDimitry Andric if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8761fe013be4SDimitry Andric Operands.push_back(Plan->getVPValueOrAddLiveIn(
8762fe6060f1SDimitry Andric Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8763fe6060f1SDimitry Andric } else {
8764fe6060f1SDimitry Andric auto OpRange = Plan->mapToVPValues(Instr->operands());
8765fe6060f1SDimitry Andric Operands = {OpRange.begin(), OpRange.end()};
8766fe6060f1SDimitry Andric }
876781ad6265SDimitry Andric
876881ad6265SDimitry Andric // Invariant stores inside the loop will be deleted, and a single store
876981ad6265SDimitry Andric // with the final reduction value will be added to the exit block.
877081ad6265SDimitry Andric StoreInst *SI;
877181ad6265SDimitry Andric if ((SI = dyn_cast<StoreInst>(&I)) &&
877281ad6265SDimitry Andric Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
877381ad6265SDimitry Andric continue;
877481ad6265SDimitry Andric
8775fe013be4SDimitry Andric auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8776fe013be4SDimitry Andric Instr, Operands, Range, VPBB, Plan);
8777fe013be4SDimitry Andric if (!RecipeOrValue)
8778fe013be4SDimitry Andric RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8779fe6060f1SDimitry Andric // If Instr can be simplified to an existing VPValue, use it.
8780fe013be4SDimitry Andric if (isa<VPValue *>(RecipeOrValue)) {
8781fe013be4SDimitry Andric auto *VPV = cast<VPValue *>(RecipeOrValue);
8782fe6060f1SDimitry Andric Plan->addVPValue(Instr, VPV);
8783fe6060f1SDimitry Andric // If the re-used value is a recipe, register the recipe for the
8784fe6060f1SDimitry Andric // instruction, in case the recipe for Instr needs to be recorded.
8785bdd1243dSDimitry Andric if (VPRecipeBase *R = VPV->getDefiningRecipe())
8786fe6060f1SDimitry Andric RecipeBuilder.setRecipe(Instr, R);
8787fe6060f1SDimitry Andric continue;
8788fe6060f1SDimitry Andric }
8789fe6060f1SDimitry Andric // Otherwise, add the new recipe.
8790fe013be4SDimitry Andric VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
8791e8d8bef9SDimitry Andric for (auto *Def : Recipe->definedValues()) {
8792e8d8bef9SDimitry Andric auto *UV = Def->getUnderlyingValue();
8793e8d8bef9SDimitry Andric Plan->addVPValue(UV, Def);
8794e8d8bef9SDimitry Andric }
8795e8d8bef9SDimitry Andric
8796fe013be4SDimitry Andric RecipeBuilder.setRecipe(Instr, Recipe);
8797c9157d92SDimitry Andric if (isa<VPHeaderPHIRecipe>(Recipe)) {
8798c9157d92SDimitry Andric // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8799c9157d92SDimitry Andric // the following cases, VPHeaderPHIRecipes may be created after non-phi
8800c9157d92SDimitry Andric // recipes and need to be moved to the phi section of HeaderVPBB:
8801c9157d92SDimitry Andric // * tail-folding (non-phi recipes computing the header mask are
8802c9157d92SDimitry Andric // introduced earlier than regular header phi recipes, and should appear
8803c9157d92SDimitry Andric // after them)
8804c9157d92SDimitry Andric // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8805c9157d92SDimitry Andric
8806c9157d92SDimitry Andric assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8807c9157d92SDimitry Andric CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8808c9157d92SDimitry Andric "unexpected recipe needs moving");
8809fe013be4SDimitry Andric Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8810fe013be4SDimitry Andric } else
88115ffd83dbSDimitry Andric VPBB->appendRecipe(Recipe);
88120b57cec5SDimitry Andric }
88130eae32dcSDimitry Andric
88140eae32dcSDimitry Andric VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
88150eae32dcSDimitry Andric VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
88160b57cec5SDimitry Andric }
88170b57cec5SDimitry Andric
88180eae32dcSDimitry Andric // After here, VPBB should not be used.
88190eae32dcSDimitry Andric VPBB = nullptr;
88200eae32dcSDimitry Andric
8821fe013be4SDimitry Andric if (CM.requiresScalarEpilogue(Range)) {
8822fe013be4SDimitry Andric // No edge from the middle block to the unique exit block has been inserted,
8823fe013be4SDimitry Andric // and there is nothing to fix from the vector loop; the phis should only have
8824fe013be4SDimitry Andric // incoming values from the scalar loop.
8825fe013be4SDimitry Andric } else
8826c9157d92SDimitry Andric addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
882781ad6265SDimitry Andric
882881ad6265SDimitry Andric assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
882981ad6265SDimitry Andric !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
88304824e7fdSDimitry Andric "entry block must be set to a VPRegionBlock having a non-empty entry "
88314824e7fdSDimitry Andric "VPBasicBlock");
8832fe6060f1SDimitry Andric RecipeBuilder.fixHeaderPhis();
8833fe6060f1SDimitry Andric
8834480093f4SDimitry Andric // ---------------------------------------------------------------------------
8835480093f4SDimitry Andric // Transform initial VPlan: Apply previously taken decisions, in order, to
8836480093f4SDimitry Andric // bring the VPlan to its final state.
8837480093f4SDimitry Andric // ---------------------------------------------------------------------------
8838480093f4SDimitry Andric
8839349cc55cSDimitry Andric // Adjust the recipes for any in-loop reductions.
8840c9157d92SDimitry Andric adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8841349cc55cSDimitry Andric
8842480093f4SDimitry Andric // Interleave memory: for each Interleave Group we marked earlier as relevant
8843480093f4SDimitry Andric // for this VPlan, replace the Recipes widening its memory instructions with a
8844480093f4SDimitry Andric // single VPInterleaveRecipe at its insertion point.
8845bdd1243dSDimitry Andric for (const auto *IG : InterleaveGroups) {
8846480093f4SDimitry Andric auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8847480093f4SDimitry Andric RecipeBuilder.getRecipe(IG->getInsertPos()));
8848e8d8bef9SDimitry Andric SmallVector<VPValue *, 4> StoredValues;
8849e8d8bef9SDimitry Andric for (unsigned i = 0; i < IG->getFactor(); ++i)
8850fe6060f1SDimitry Andric if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8851fe6060f1SDimitry Andric auto *StoreR =
8852fe6060f1SDimitry Andric cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8853fe6060f1SDimitry Andric StoredValues.push_back(StoreR->getStoredValue());
8854fe6060f1SDimitry Andric }
8855480093f4SDimitry Andric
8856fe013be4SDimitry Andric bool NeedsMaskForGaps =
8857fe013be4SDimitry Andric IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8858e8d8bef9SDimitry Andric auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8859fe013be4SDimitry Andric Recipe->getMask(), NeedsMaskForGaps);
8860e8d8bef9SDimitry Andric VPIG->insertBefore(Recipe);
8861e8d8bef9SDimitry Andric unsigned J = 0;
8862480093f4SDimitry Andric for (unsigned i = 0; i < IG->getFactor(); ++i)
8863480093f4SDimitry Andric if (Instruction *Member = IG->getMember(i)) {
8864fe013be4SDimitry Andric VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8865e8d8bef9SDimitry Andric if (!Member->getType()->isVoidTy()) {
8866fe013be4SDimitry Andric VPValue *OriginalV = MemberR->getVPSingleValue();
8867e8d8bef9SDimitry Andric OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8868e8d8bef9SDimitry Andric J++;
8869e8d8bef9SDimitry Andric }
8870fe013be4SDimitry Andric MemberR->eraseFromParent();
8871480093f4SDimitry Andric }
8872480093f4SDimitry Andric }
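// As a rough illustration of what a VPInterleaveRecipe models (factor 2,
// fixed VF, invented names): two interleaved member loads
//
//   %even = load i32, ptr %p        ; stride-2, even elements
//   %odd  = load i32, ptr %p.plus.1 ; stride-2, odd elements
//
// are replaced by a single wide load of 2 * VF elements followed by
// shufflevectors that extract the even and odd lanes. For scalable vectors
// the splitting is done with the (de)interleave2 intrinsics instead, which
// is why only factor 2 is supported there.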
8873480093f4SDimitry Andric
8874fe013be4SDimitry Andric for (ElementCount VF : Range)
88750b57cec5SDimitry Andric Plan->addVF(VF);
8876bdd1243dSDimitry Andric Plan->setName("Initial VPlan");
88770b57cec5SDimitry Andric
8878fe013be4SDimitry Andric // Replace VPValues for known constant strides guaranteed by predicated
8879fe013be4SDimitry Andric // scalar evolution.
8880fe013be4SDimitry Andric for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8881fe013be4SDimitry Andric auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8882fe013be4SDimitry Andric auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8883fe013be4SDimitry Andric // Only handle constant strides for now.
8884fe013be4SDimitry Andric if (!ScevStride)
8885fe013be4SDimitry Andric continue;
8886fe013be4SDimitry Andric Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8887fe013be4SDimitry Andric
8888fe013be4SDimitry Andric auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8889fe013be4SDimitry Andric // The versioned value may not be used in the loop directly, so just add a
8890fe013be4SDimitry Andric // new live-in in those cases.
8891fe013be4SDimitry Andric Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8892fe013be4SDimitry Andric }
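// Hypothetical example: if the loop indexes A[i * %stride] and loop
// versioning added the predicate "%stride == 1", PSE folds the SCEV of
// %stride to the constant 1. The loop above then replaces all VPlan uses of
// the live-in for %stride with a live-in for that constant, so later
// simplifications can treat the accesses as consecutive.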
8893fe013be4SDimitry Andric
889481ad6265SDimitry Andric // From this point onwards, VPlan-to-VPlan transformations may change the plan
889581ad6265SDimitry Andric // in ways that make accessing values through their original IR values incorrect.
889681ad6265SDimitry Andric Plan->disableValue2VPValue();
889781ad6265SDimitry Andric
8898fe013be4SDimitry Andric // Sink users of fixed-order recurrence past the recipe defining the previous
8899fe013be4SDimitry Andric // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8900fe013be4SDimitry Andric if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8901c9157d92SDimitry Andric return nullptr;
8902fe013be4SDimitry Andric
8903c9157d92SDimitry Andric if (useActiveLaneMask(Style)) {
8904c9157d92SDimitry Andric // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8905c9157d92SDimitry Andric // TailFoldingStyle is visible there.
8906c9157d92SDimitry Andric bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8907c9157d92SDimitry Andric bool WithoutRuntimeCheck =
8908c9157d92SDimitry Andric Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8909c9157d92SDimitry Andric VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8910c9157d92SDimitry Andric WithoutRuntimeCheck);
8911c9157d92SDimitry Andric }
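// When the active lane mask also drives control flow, the vector loop ends
// up recomputing something like the following each iteration (sketch only,
// scalable VF assumed):
//
//   %alm = call <vscale x 4 x i1>
//          @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %trip.count)
//
// and exits once the mask becomes all-false, instead of comparing the
// canonical IV against the vector trip count.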
8912c9157d92SDimitry Andric return Plan;
89130b57cec5SDimitry Andric }
89140b57cec5SDimitry Andric
89150b57cec5SDimitry Andric VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
89160b57cec5SDimitry Andric // Outer loop handling: outer loops may require CFG and instruction level
89170b57cec5SDimitry Andric // transformations before even evaluating whether vectorization is profitable.
89180b57cec5SDimitry Andric // Since we cannot modify the incoming IR, we need to build VPlan upfront in
89190b57cec5SDimitry Andric // the vectorization pipeline.
8920e8d8bef9SDimitry Andric assert(!OrigLoop->isInnermost());
89210b57cec5SDimitry Andric assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
89220b57cec5SDimitry Andric
89230b57cec5SDimitry Andric // Create new empty VPlan
8924fe013be4SDimitry Andric auto Plan = VPlan::createInitialVPlan(
8925fe013be4SDimitry Andric createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8926fe013be4SDimitry Andric *PSE.getSE());
89270b57cec5SDimitry Andric
89280b57cec5SDimitry Andric // Build hierarchical CFG
89290b57cec5SDimitry Andric VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
89300b57cec5SDimitry Andric HCFGBuilder.buildHierarchicalCFG();
89310b57cec5SDimitry Andric
8932fe013be4SDimitry Andric for (ElementCount VF : Range)
89330b57cec5SDimitry Andric Plan->addVF(VF);
89340b57cec5SDimitry Andric
89350eae32dcSDimitry Andric VPlanTransforms::VPInstructionsToVPRecipes(
8936fe013be4SDimitry Andric Plan,
89370eae32dcSDimitry Andric [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8938fe013be4SDimitry Andric *PSE.getSE(), *TLI);
893904eeddc0SDimitry Andric
894081ad6265SDimitry Andric // Remove the existing terminator of the exiting block of the top-most region.
894181ad6265SDimitry Andric // A BranchOnCount will be added instead when adding the canonical IV recipes.
894281ad6265SDimitry Andric auto *Term =
894381ad6265SDimitry Andric Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
894481ad6265SDimitry Andric Term->eraseFromParent();
894581ad6265SDimitry Andric
8946c9157d92SDimitry Andric // Tail folding is not supported for outer loops, so the induction increment
8947c9157d92SDimitry Andric // is guaranteed to not wrap.
8948c9157d92SDimitry Andric bool HasNUW = true;
8949c9157d92SDimitry Andric addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8950c9157d92SDimitry Andric DebugLoc());
89510b57cec5SDimitry Andric return Plan;
89520b57cec5SDimitry Andric }
89530b57cec5SDimitry Andric
8954349cc55cSDimitry Andric // Adjust the recipes for reductions. For in-loop reductions the chain of
8955349cc55cSDimitry Andric // instructions leading from the loop exit instr to the phi needs to be converted
8956349cc55cSDimitry Andric // to reductions, with one operand being vector and the other being the scalar
8957349cc55cSDimitry Andric // reduction chain. For other reductions, a select is introduced between the phi
8958349cc55cSDimitry Andric // and live-out recipes when folding the tail.
8959cdc20ff6SDimitry Andric //
8960cdc20ff6SDimitry Andric // A ComputeReductionResult recipe is added to the middle block, also for
8961cdc20ff6SDimitry Andric // in-loop reductions which compute their result in-loop, because generating
8962cdc20ff6SDimitry Andric // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8963349cc55cSDimitry Andric void LoopVectorizationPlanner::adjustRecipesForReductions(
8964349cc55cSDimitry Andric VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8965349cc55cSDimitry Andric ElementCount MinVF) {
8966cdc20ff6SDimitry Andric VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8967cdc20ff6SDimitry Andric VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8968c9157d92SDimitry Andric // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8969c9157d92SDimitry Andric // sunk outside of the loop keep the same order as they had in the
8970c9157d92SDimitry Andric // original loop.
8971c9157d92SDimitry Andric SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8972c9157d92SDimitry Andric for (VPRecipeBase &R : Header->phis()) {
8973c9157d92SDimitry Andric if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8974c9157d92SDimitry Andric ReductionPHIList.emplace_back(ReductionPhi);
8975c9157d92SDimitry Andric }
8976c9157d92SDimitry Andric bool HasIntermediateStore = false;
8977c9157d92SDimitry Andric stable_sort(ReductionPHIList,
8978c9157d92SDimitry Andric [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8979c9157d92SDimitry Andric const VPReductionPHIRecipe *R2) {
8980c9157d92SDimitry Andric auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8981c9157d92SDimitry Andric auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8982c9157d92SDimitry Andric HasIntermediateStore |= IS1 || IS2;
8983e8d8bef9SDimitry Andric
8984c9157d92SDimitry Andric // If neither of the recipes has an intermediate store, keep the
8985c9157d92SDimitry Andric // order the same.
8986c9157d92SDimitry Andric if (!IS1 && !IS2)
8987c9157d92SDimitry Andric return false;
8988c9157d92SDimitry Andric
8989c9157d92SDimitry Andric // If only one of the recipes has an intermediate store, then
8990c9157d92SDimitry Andric // move it towards the beginning of the list.
8991c9157d92SDimitry Andric if (IS1 && !IS2)
8992c9157d92SDimitry Andric return true;
8993c9157d92SDimitry Andric
8994c9157d92SDimitry Andric if (!IS1 && IS2)
8995c9157d92SDimitry Andric return false;
8996c9157d92SDimitry Andric
8997c9157d92SDimitry Andric // If both recipes have an intermediate store, then the recipe
8998c9157d92SDimitry Andric // with the later store should be processed earlier. So it
8999c9157d92SDimitry Andric // should go to the beginning of the list.
9000c9157d92SDimitry Andric return DT->dominates(IS2, IS1);
9001c9157d92SDimitry Andric });
9002c9157d92SDimitry Andric
9003c9157d92SDimitry Andric if (HasIntermediateStore && ReductionPHIList.size() > 1)
9004c9157d92SDimitry Andric for (VPRecipeBase *R : ReductionPHIList)
9005c9157d92SDimitry Andric R->moveBefore(*Header, Header->getFirstNonPhi());
9006c9157d92SDimitry Andric
9007c9157d92SDimitry Andric for (VPRecipeBase &R : Header->phis()) {
9008c9157d92SDimitry Andric auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9009c9157d92SDimitry Andric if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9010fe6060f1SDimitry Andric continue;
9011fe6060f1SDimitry Andric
9012c9157d92SDimitry Andric const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9013e8d8bef9SDimitry Andric RecurKind Kind = RdxDesc.getRecurrenceKind();
9014c9157d92SDimitry Andric assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9015c9157d92SDimitry Andric "AnyOf reductions are not allowed for in-loop reductions");
9016e8d8bef9SDimitry Andric
9017c9157d92SDimitry Andric // Collect the chain of "link" recipes for the reduction starting at PhiR.
9018a58f00eaSDimitry Andric SetVector<VPSingleDefRecipe *> Worklist;
9019c9157d92SDimitry Andric Worklist.insert(PhiR);
9020c9157d92SDimitry Andric for (unsigned I = 0; I != Worklist.size(); ++I) {
9021a58f00eaSDimitry Andric VPSingleDefRecipe *Cur = Worklist[I];
9022a58f00eaSDimitry Andric for (VPUser *U : Cur->users()) {
9023a58f00eaSDimitry Andric auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
9024a58f00eaSDimitry Andric if (!UserRecipe) {
9025a58f00eaSDimitry Andric assert(isa<VPLiveOut>(U) &&
9026a58f00eaSDimitry Andric "U must either be a VPSingleDef or VPLiveOut");
9027c9157d92SDimitry Andric continue;
9028a58f00eaSDimitry Andric }
9029c9157d92SDimitry Andric Worklist.insert(UserRecipe);
9030c9157d92SDimitry Andric }
9031c9157d92SDimitry Andric }
9032c9157d92SDimitry Andric
9033c9157d92SDimitry Andric // Visit operation "Links" along the reduction chain top-down starting from
9034c9157d92SDimitry Andric // the phi until LoopExitValue. We keep track of the previous item
9035c9157d92SDimitry Andric // (PreviousLink) to tell which of the two operands of a Link will remain
9036c9157d92SDimitry Andric // scalar and which will be reduced. For minmax by select(cmp), Link will be
9037c9157d92SDimitry Andric // the select instruction.
9038a58f00eaSDimitry Andric VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9039a58f00eaSDimitry Andric for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9040c9157d92SDimitry Andric Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9041c9157d92SDimitry Andric
9042c9157d92SDimitry Andric // Index of the first operand which holds a non-mask vector operand.
9043c9157d92SDimitry Andric unsigned IndexOfFirstOperand;
90444824e7fdSDimitry Andric // Recognize a call to the llvm.fmuladd intrinsic.
90454824e7fdSDimitry Andric bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9046c9157d92SDimitry Andric VPValue *VecOp;
9047c9157d92SDimitry Andric VPBasicBlock *LinkVPBB = CurrentLink->getParent();
90484824e7fdSDimitry Andric if (IsFMulAdd) {
9049c9157d92SDimitry Andric assert(
9050c9157d92SDimitry Andric RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9051c9157d92SDimitry Andric "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9052c9157d92SDimitry Andric assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9053c9157d92SDimitry Andric isa<VPWidenCallRecipe>(CurrentLink)) &&
9054a58f00eaSDimitry Andric CurrentLink->getOperand(2) == PreviousLink &&
9055c9157d92SDimitry Andric "expected a call where the previous link is the added operand");
9056c9157d92SDimitry Andric
90574824e7fdSDimitry Andric // If the instruction is a call to the llvm.fmuladd intrinsic then we
9058c9157d92SDimitry Andric // need to create an fmul recipe (multiplying the first two operands of
9059c9157d92SDimitry Andric // the fmuladd together) to use as the vector operand for the fadd
9060c9157d92SDimitry Andric // reduction.
90614824e7fdSDimitry Andric VPInstruction *FMulRecipe = new VPInstruction(
9062c9157d92SDimitry Andric Instruction::FMul,
9063c9157d92SDimitry Andric {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9064c9157d92SDimitry Andric CurrentLinkI->getFastMathFlags());
9065c9157d92SDimitry Andric LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
90664824e7fdSDimitry Andric VecOp = FMulRecipe;
9067c9157d92SDimitry Andric } else {
9068c9157d92SDimitry Andric if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9069c9157d92SDimitry Andric if (isa<VPWidenRecipe>(CurrentLink)) {
9070c9157d92SDimitry Andric assert(isa<CmpInst>(CurrentLinkI) &&
9071c9157d92SDimitry Andric "need to have the compare of the select");
9072c9157d92SDimitry Andric continue;
90734824e7fdSDimitry Andric }
9074c9157d92SDimitry Andric assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9075c9157d92SDimitry Andric "must be a select recipe");
9076c9157d92SDimitry Andric IndexOfFirstOperand = 1;
9077c9157d92SDimitry Andric } else {
9078c9157d92SDimitry Andric assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9079c9157d92SDimitry Andric "Expected to replace a VPWidenSC");
9080c9157d92SDimitry Andric IndexOfFirstOperand = 0;
9081c9157d92SDimitry Andric }
9082c9157d92SDimitry Andric // Note that for non-commutable operands (cmp-selects), the semantics of
9083c9157d92SDimitry Andric // the cmp-select are captured in the recurrence kind.
9084c9157d92SDimitry Andric unsigned VecOpId =
9085a58f00eaSDimitry Andric CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9086c9157d92SDimitry Andric ? IndexOfFirstOperand + 1
9087c9157d92SDimitry Andric : IndexOfFirstOperand;
9088c9157d92SDimitry Andric VecOp = CurrentLink->getOperand(VecOpId);
9089a58f00eaSDimitry Andric assert(VecOp != PreviousLink &&
9090c9157d92SDimitry Andric CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9091c9157d92SDimitry Andric (VecOpId - IndexOfFirstOperand)) ==
9092a58f00eaSDimitry Andric PreviousLink &&
9093a58f00eaSDimitry Andric "PreviousLink must be the operand other than VecOp");
9094c9157d92SDimitry Andric }
9095c9157d92SDimitry Andric
9096c9157d92SDimitry Andric BasicBlock *BB = CurrentLinkI->getParent();
9097c9157d92SDimitry Andric VPValue *CondOp = nullptr;
9098c9157d92SDimitry Andric if (CM.blockNeedsPredicationForAnyReason(BB)) {
9099c9157d92SDimitry Andric VPBuilder::InsertPointGuard Guard(Builder);
9100c9157d92SDimitry Andric Builder.setInsertPoint(CurrentLink);
9101cdc20ff6SDimitry Andric CondOp = RecipeBuilder.getBlockInMask(BB);
9102c9157d92SDimitry Andric }
9103c9157d92SDimitry Andric
9104c9157d92SDimitry Andric VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9105a58f00eaSDimitry Andric RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
9106753f127fSDimitry Andric // Append the recipe to the end of the VPBasicBlock because we need to
9107753f127fSDimitry Andric // ensure that it comes after all of its inputs, including CondOp.
9108c9157d92SDimitry Andric // Note that this transformation may leave behind dead recipes (including
9109c9157d92SDimitry Andric // CurrentLink), which will be cleaned up by a later VPlan transform.
9110c9157d92SDimitry Andric LinkVPBB->appendRecipe(RedRecipe);
9111a58f00eaSDimitry Andric CurrentLink->replaceAllUsesWith(RedRecipe);
9112c9157d92SDimitry Andric PreviousLink = RedRecipe;
9113e8d8bef9SDimitry Andric }
9114e8d8bef9SDimitry Andric }
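// Sketch of the effect for an in-loop integer add reduction (invented
// names): the widened link
//
//   WIDEN ir<%sum.next> = add ir<%sum>, ir<%val>
//
// is replaced by a VPReductionRecipe which, at execution time, emits a
// horizontal reduction of the vector operand plus a scalar add into the
// chain, roughly:
//
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %val.vec)
//   %sum.next = add i32 %sum, %r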
9115c9157d92SDimitry Andric Builder.setInsertPoint(&*LatchVPBB->begin());
911681ad6265SDimitry Andric for (VPRecipeBase &R :
911781ad6265SDimitry Andric Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9118349cc55cSDimitry Andric VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9119cdc20ff6SDimitry Andric if (!PhiR)
9120349cc55cSDimitry Andric continue;
9121c9157d92SDimitry Andric
9122c9157d92SDimitry Andric const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9123c9157d92SDimitry Andric // If tail is folded by masking, introduce selects between the phi
9124c9157d92SDimitry Andric // and the live-out instruction of each reduction, at the beginning of the
9125c9157d92SDimitry Andric // dedicated latch block.
9126cdc20ff6SDimitry Andric auto *OrigExitingVPV = PhiR->getBackedgeValue();
9127cdc20ff6SDimitry Andric auto *NewExitingVPV = PhiR->getBackedgeValue();
9128cdc20ff6SDimitry Andric if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9129cdc20ff6SDimitry Andric VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9130cdc20ff6SDimitry Andric assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
913104eeddc0SDimitry Andric "reduction recipe must be defined before latch");
9132c9157d92SDimitry Andric Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9133cdc20ff6SDimitry Andric std::optional<FastMathFlags> FMFs =
9134c9157d92SDimitry Andric PhiTy->isFloatingPointTy()
9135cdc20ff6SDimitry Andric ? std::make_optional(RdxDesc.getFastMathFlags())
9136cdc20ff6SDimitry Andric : std::nullopt;
9137cdc20ff6SDimitry Andric NewExitingVPV =
9138cdc20ff6SDimitry Andric Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9139cdc20ff6SDimitry Andric OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9140cdc20ff6SDimitry Andric return isa<VPInstruction>(&U) &&
9141cdc20ff6SDimitry Andric cast<VPInstruction>(&U)->getOpcode() ==
9142cdc20ff6SDimitry Andric VPInstruction::ComputeReductionResult;
9143cdc20ff6SDimitry Andric });
9144c9157d92SDimitry Andric if (PreferPredicatedReductionSelect ||
9145c9157d92SDimitry Andric TTI.preferPredicatedReductionSelect(
9146c9157d92SDimitry Andric PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9147c9157d92SDimitry Andric TargetTransformInfo::ReductionFlags()))
9148cdc20ff6SDimitry Andric PhiR->setOperand(1, NewExitingVPV);
9149c9157d92SDimitry Andric }
9150cdc20ff6SDimitry Andric
9151c9157d92SDimitry Andric // If the vector reduction can be performed in a smaller type, we truncate
9152c9157d92SDimitry Andric // then extend the loop exit value to enable InstCombine to evaluate the
9153c9157d92SDimitry Andric // entire expression in the smaller type.
9154c9157d92SDimitry Andric Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9155c9157d92SDimitry Andric if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9156c9157d92SDimitry Andric assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9157c9157d92SDimitry Andric Type *RdxTy = RdxDesc.getRecurrenceType();
9158cdc20ff6SDimitry Andric auto *Trunc =
9159cdc20ff6SDimitry Andric new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9160c9157d92SDimitry Andric auto *Extnd =
9161c9157d92SDimitry Andric RdxDesc.isSigned()
9162c9157d92SDimitry Andric ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9163c9157d92SDimitry Andric : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9164c9157d92SDimitry Andric
9165cdc20ff6SDimitry Andric Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9166c9157d92SDimitry Andric Extnd->insertAfter(Trunc);
9167cdc20ff6SDimitry Andric if (PhiR->getOperand(1) == NewExitingVPV)
9168cdc20ff6SDimitry Andric PhiR->setOperand(1, Extnd->getVPSingleValue());
9169cdc20ff6SDimitry Andric NewExitingVPV = Extnd;
9170349cc55cSDimitry Andric }
9171cdc20ff6SDimitry Andric
9172cdc20ff6SDimitry Andric // We want code in the middle block to appear to execute on the location of
9173cdc20ff6SDimitry Andric // the scalar loop's latch terminator because: (a) it is all compiler
9174cdc20ff6SDimitry Andric // generated, (b) these instructions are always executed after evaluating
9175cdc20ff6SDimitry Andric // the latch conditional branch, and (c) other passes may add new
9176cdc20ff6SDimitry Andric // predecessors which terminate on this line. This is the easiest way to
9177cdc20ff6SDimitry Andric // ensure we don't accidentally cause an extra step back into the loop while
9178cdc20ff6SDimitry Andric // debugging.
9179cdc20ff6SDimitry Andric DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9180cdc20ff6SDimitry Andric
9181cdc20ff6SDimitry Andric // TODO: At the moment ComputeReductionResult also drives creation of the
9182cdc20ff6SDimitry Andric // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9183cdc20ff6SDimitry Andric // even for in-loop reductions, until the reduction resume value handling is
9184cdc20ff6SDimitry Andric // also modeled in VPlan.
9185cdc20ff6SDimitry Andric auto *FinalReductionResult = new VPInstruction(
9186cdc20ff6SDimitry Andric VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9187cdc20ff6SDimitry Andric cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9188cdc20ff6SDimitry Andric ->appendRecipe(FinalReductionResult);
9189cdc20ff6SDimitry Andric OrigExitingVPV->replaceUsesWithIf(
9190cdc20ff6SDimitry Andric FinalReductionResult,
9191cdc20ff6SDimitry Andric [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9192349cc55cSDimitry Andric }
9193fe013be4SDimitry Andric
9194fe013be4SDimitry Andric VPlanTransforms::clearReductionWrapFlags(*Plan);
9195e8d8bef9SDimitry Andric }
9196e8d8bef9SDimitry Andric
9197fe6060f1SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
91985ffd83dbSDimitry Andric void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
91995ffd83dbSDimitry Andric VPSlotTracker &SlotTracker) const {
9200fe6060f1SDimitry Andric O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
92010b57cec5SDimitry Andric IG->getInsertPos()->printAsOperand(O, false);
92020b57cec5SDimitry Andric O << ", ";
92035ffd83dbSDimitry Andric getAddr()->printAsOperand(O, SlotTracker);
9204480093f4SDimitry Andric VPValue *Mask = getMask();
9205480093f4SDimitry Andric if (Mask) {
9206480093f4SDimitry Andric O << ", ";
92075ffd83dbSDimitry Andric Mask->printAsOperand(O, SlotTracker);
92080b57cec5SDimitry Andric }
9209349cc55cSDimitry Andric
9210349cc55cSDimitry Andric unsigned OpIdx = 0;
9211349cc55cSDimitry Andric for (unsigned i = 0; i < IG->getFactor(); ++i) {
9212349cc55cSDimitry Andric if (!IG->getMember(i))
9213349cc55cSDimitry Andric continue;
9214349cc55cSDimitry Andric if (getNumStoreOperands() > 0) {
9215349cc55cSDimitry Andric O << "\n" << Indent << " store ";
9216349cc55cSDimitry Andric getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9217349cc55cSDimitry Andric O << " to index " << i;
9218349cc55cSDimitry Andric } else {
9219349cc55cSDimitry Andric O << "\n" << Indent << " ";
9220349cc55cSDimitry Andric getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9221349cc55cSDimitry Andric O << " = load from index " << i;
9222349cc55cSDimitry Andric }
9223349cc55cSDimitry Andric ++OpIdx;
9224349cc55cSDimitry Andric }
92255ffd83dbSDimitry Andric }
9226fe6060f1SDimitry Andric #endif
92275ffd83dbSDimitry Andric
922881ad6265SDimitry Andric void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
922981ad6265SDimitry Andric assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
923081ad6265SDimitry Andric "Not a pointer induction according to InductionDescriptor!");
923181ad6265SDimitry Andric assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
923281ad6265SDimitry Andric "Unexpected type.");
923381ad6265SDimitry Andric
923481ad6265SDimitry Andric auto *IVR = getParent()->getPlan()->getCanonicalIV();
923581ad6265SDimitry Andric PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
923681ad6265SDimitry Andric
923781ad6265SDimitry Andric if (onlyScalarsGenerated(State.VF)) {
923881ad6265SDimitry Andric // This is the normalized GEP that starts counting at zero.
923981ad6265SDimitry Andric Value *PtrInd = State.Builder.CreateSExtOrTrunc(
924081ad6265SDimitry Andric CanonicalIV, IndDesc.getStep()->getType());
924181ad6265SDimitry Andric // Determine the number of scalars we need to generate for each unroll
924281ad6265SDimitry Andric // iteration. If the instruction is uniform, we only need to generate the
924381ad6265SDimitry Andric // first lane. Otherwise, we generate all VF values.
924481ad6265SDimitry Andric bool IsUniform = vputils::onlyFirstLaneUsed(this);
924581ad6265SDimitry Andric assert((IsUniform || !State.VF.isScalable()) &&
924681ad6265SDimitry Andric "Cannot scalarize a scalable VF");
924781ad6265SDimitry Andric unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
924881ad6265SDimitry Andric
924981ad6265SDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part) {
925081ad6265SDimitry Andric Value *PartStart =
925181ad6265SDimitry Andric createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
925281ad6265SDimitry Andric
925381ad6265SDimitry Andric for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
925481ad6265SDimitry Andric Value *Idx = State.Builder.CreateAdd(
925581ad6265SDimitry Andric PartStart, ConstantInt::get(PtrInd->getType(), Lane));
925681ad6265SDimitry Andric Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
925781ad6265SDimitry Andric
9258fe013be4SDimitry Andric Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
925981ad6265SDimitry Andric Value *SclrGep = emitTransformedIndex(
9260c9157d92SDimitry Andric State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9261c9157d92SDimitry Andric IndDesc.getKind(), IndDesc.getInductionBinOp());
926281ad6265SDimitry Andric SclrGep->setName("next.gep");
926381ad6265SDimitry Andric State.set(this, SclrGep, VPIteration(Part, Lane));
926481ad6265SDimitry Andric }
926581ad6265SDimitry Andric }
926681ad6265SDimitry Andric return;
926781ad6265SDimitry Andric }
926881ad6265SDimitry Andric
926981ad6265SDimitry Andric Type *PhiType = IndDesc.getStep()->getType();
927081ad6265SDimitry Andric
927181ad6265SDimitry Andric // Build a pointer phi
927281ad6265SDimitry Andric Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
927381ad6265SDimitry Andric Type *ScStValueType = ScalarStartValue->getType();
927481ad6265SDimitry Andric PHINode *NewPointerPhi =
927581ad6265SDimitry Andric PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
927681ad6265SDimitry Andric
927781ad6265SDimitry Andric BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
927881ad6265SDimitry Andric NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
927981ad6265SDimitry Andric
928081ad6265SDimitry Andric // A pointer induction, performed by using a gep
928181ad6265SDimitry Andric Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
928281ad6265SDimitry Andric
9283bdd1243dSDimitry Andric Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
928481ad6265SDimitry Andric Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
928581ad6265SDimitry Andric Value *NumUnrolledElems =
928681ad6265SDimitry Andric State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
928781ad6265SDimitry Andric Value *InductionGEP = GetElementPtrInst::Create(
9288fe013be4SDimitry Andric State.Builder.getInt8Ty(), NewPointerPhi,
928981ad6265SDimitry Andric State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
929081ad6265SDimitry Andric InductionLoc);
929181ad6265SDimitry Andric // Add induction update using an incorrect block temporarily. The phi node
929281ad6265SDimitry Andric // will be fixed after VPlan execution. Note that at this point the latch
929381ad6265SDimitry Andric // block cannot be used, as it does not exist yet.
929481ad6265SDimitry Andric // TODO: Model increment value in VPlan, by turning the recipe into a
929581ad6265SDimitry Andric // multi-def and a subclass of VPHeaderPHIRecipe.
929681ad6265SDimitry Andric NewPointerPhi->addIncoming(InductionGEP, VectorPH);
929781ad6265SDimitry Andric
929881ad6265SDimitry Andric // Create UF many actual address geps that use the pointer
929981ad6265SDimitry Andric // phi as base and a vectorized version of the step value
930081ad6265SDimitry Andric // (<step*0, ..., step*N>) as offset.
930181ad6265SDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part) {
930281ad6265SDimitry Andric Type *VecPhiType = VectorType::get(PhiType, State.VF);
930381ad6265SDimitry Andric Value *StartOffsetScalar =
930481ad6265SDimitry Andric State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
930581ad6265SDimitry Andric Value *StartOffset =
930681ad6265SDimitry Andric State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
930781ad6265SDimitry Andric // Create a vector of consecutive numbers from zero to VF.
930881ad6265SDimitry Andric StartOffset = State.Builder.CreateAdd(
930981ad6265SDimitry Andric StartOffset, State.Builder.CreateStepVector(VecPhiType));
931081ad6265SDimitry Andric
9311fe013be4SDimitry Andric assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9312bdd1243dSDimitry Andric "scalar step must be the same across all parts");
931381ad6265SDimitry Andric Value *GEP = State.Builder.CreateGEP(
9314fe013be4SDimitry Andric State.Builder.getInt8Ty(), NewPointerPhi,
931581ad6265SDimitry Andric State.Builder.CreateMul(
931681ad6265SDimitry Andric StartOffset,
931781ad6265SDimitry Andric State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
931881ad6265SDimitry Andric "vector.gep"));
931981ad6265SDimitry Andric State.set(this, GEP, Part);
932081ad6265SDimitry Andric }
932181ad6265SDimitry Andric }
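// For intuition, the non-scalarized path above produces IR roughly of this
// shape for one part (fixed VF of 4, invented names):
//
//   %pointer.phi = phi ptr [ %start, %vector.ph ], [ %ptr.ind, %vector.body ]
//   %offsets     = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %step.splat
//   %vector.gep  = getelementptr i8, ptr %pointer.phi, <4 x i64> %offsets
//   ...
//   %ptr.ind     = getelementptr i8, ptr %pointer.phi, i64 %step.x.vf.x.uf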
932281ad6265SDimitry Andric
9323bdd1243dSDimitry Andric void VPDerivedIVRecipe::execute(VPTransformState &State) {
9324bdd1243dSDimitry Andric assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
932581ad6265SDimitry Andric
932681ad6265SDimitry Andric // Fast-math-flags propagate from the original induction instruction.
932781ad6265SDimitry Andric IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9328c9157d92SDimitry Andric if (FPBinOp)
9329c9157d92SDimitry Andric State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
933081ad6265SDimitry Andric
933181ad6265SDimitry Andric Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9332bdd1243dSDimitry Andric Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9333c9157d92SDimitry Andric Value *DerivedIV = emitTransformedIndex(
9334c9157d92SDimitry Andric State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9335c9157d92SDimitry Andric Kind, cast_if_present<BinaryOperator>(FPBinOp));
9336bdd1243dSDimitry Andric DerivedIV->setName("offset.idx");
9337c9157d92SDimitry Andric if (TruncResultTy) {
9338c9157d92SDimitry Andric assert(TruncResultTy != DerivedIV->getType() &&
9339c9157d92SDimitry Andric Step->getType()->isIntegerTy() &&
934081ad6265SDimitry Andric "Truncation requires an integer step");
9341c9157d92SDimitry Andric DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
934281ad6265SDimitry Andric }
9343bdd1243dSDimitry Andric assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
934481ad6265SDimitry Andric
9345bdd1243dSDimitry Andric State.set(this, DerivedIV, VPIteration(0, 0));
934681ad6265SDimitry Andric }
934781ad6265SDimitry Andric
93480b57cec5SDimitry Andric void VPInterleaveRecipe::execute(VPTransformState &State) {
93490b57cec5SDimitry Andric assert(!State.Instance && "Interleave group being replicated.");
9350e8d8bef9SDimitry Andric State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9351fe013be4SDimitry Andric getStoredValues(), getMask(),
9352fe013be4SDimitry Andric NeedsMaskForGaps);
9353e8d8bef9SDimitry Andric }
9354e8d8bef9SDimitry Andric
9355e8d8bef9SDimitry Andric void VPReductionRecipe::execute(VPTransformState &State) {
9356e8d8bef9SDimitry Andric assert(!State.Instance && "Reduction being replicated.");
9357fe6060f1SDimitry Andric Value *PrevInChain = State.get(getChainOp(), 0);
9358c9157d92SDimitry Andric RecurKind Kind = RdxDesc.getRecurrenceKind();
9359c9157d92SDimitry Andric bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9360349cc55cSDimitry Andric // Propagate the fast-math flags carried by the underlying instruction.
9361349cc55cSDimitry Andric IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9362c9157d92SDimitry Andric State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
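// Sketch of the conditional case handled below, with assumed VF=4 and an fadd
// reduction: inactive lanes are replaced by the identity value so they do not
// affect the result, e.g.
//   select <4 x i1> %cond, <4 x float> %vec.op,
//          <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>
// before the vector value is folded into the reduction chain.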
9363349cc55cSDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part) {
9364e8d8bef9SDimitry Andric Value *NewVecOp = State.get(getVecOp(), Part);
9365e8d8bef9SDimitry Andric if (VPValue *Cond = getCondOp()) {
9366c9157d92SDimitry Andric Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9367c9157d92SDimitry Andric : State.get(Cond, {Part, 0});
9368c9157d92SDimitry Andric VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9369c9157d92SDimitry Andric Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9370c9157d92SDimitry Andric Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9371c9157d92SDimitry Andric RdxDesc.getFastMathFlags());
9372c9157d92SDimitry Andric if (State.VF.isVector()) {
9373c9157d92SDimitry Andric Iden =
9374349cc55cSDimitry Andric State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9375c9157d92SDimitry Andric }
9376c9157d92SDimitry Andric
9377c9157d92SDimitry Andric Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9378e8d8bef9SDimitry Andric NewVecOp = Select;
9379e8d8bef9SDimitry Andric }
9380fe6060f1SDimitry Andric Value *NewRed;
9381e8d8bef9SDimitry Andric Value *NextInChain;
9382fe6060f1SDimitry Andric if (IsOrdered) {
9383fe6060f1SDimitry Andric if (State.VF.isVector())
9384c9157d92SDimitry Andric NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9385fe6060f1SDimitry Andric PrevInChain);
9386fe6060f1SDimitry Andric else
9387fe6060f1SDimitry Andric NewRed = State.Builder.CreateBinOp(
9388c9157d92SDimitry Andric (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9389349cc55cSDimitry Andric NewVecOp);
9390fe6060f1SDimitry Andric PrevInChain = NewRed;
9391fe6060f1SDimitry Andric } else {
9392fe6060f1SDimitry Andric PrevInChain = State.get(getChainOp(), Part);
9393c9157d92SDimitry Andric NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9394fe6060f1SDimitry Andric }
9395e8d8bef9SDimitry Andric if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9396c9157d92SDimitry Andric NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9397e8d8bef9SDimitry Andric NewRed, PrevInChain);
9398fe6060f1SDimitry Andric } else if (IsOrdered)
9399fe6060f1SDimitry Andric NextInChain = NewRed;
9400349cc55cSDimitry Andric else
9401e8d8bef9SDimitry Andric NextInChain = State.Builder.CreateBinOp(
9402c9157d92SDimitry Andric (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9403fe6060f1SDimitry Andric State.set(this, NextInChain, Part);
9404e8d8bef9SDimitry Andric }
94050b57cec5SDimitry Andric }
94060b57cec5SDimitry Andric
94070b57cec5SDimitry Andric void VPReplicateRecipe::execute(VPTransformState &State) {
9408bdd1243dSDimitry Andric Instruction *UI = getUnderlyingInstr();
94090b57cec5SDimitry Andric if (State.Instance) { // Generate a single instance.
9410e8d8bef9SDimitry Andric assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9411fe013be4SDimitry Andric State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
94120b57cec5SDimitry Andric // Insert scalar instance packing it into a vector.
9413fe013be4SDimitry Andric if (State.VF.isVector() && shouldPack()) {
9414e8d8bef9SDimitry Andric // If we're constructing lane 0, initialize to start from poison.
9415fe6060f1SDimitry Andric if (State.Instance->Lane.isFirstLane()) {
9416e8d8bef9SDimitry Andric assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9417e8d8bef9SDimitry Andric Value *Poison = PoisonValue::get(
9418bdd1243dSDimitry Andric VectorType::get(UI->getType(), State.VF));
9419fe6060f1SDimitry Andric State.set(this, Poison, State.Instance->Part);
94200b57cec5SDimitry Andric }
9421c9157d92SDimitry Andric State.packScalarIntoVectorValue(this, *State.Instance);
94220b57cec5SDimitry Andric }
94230b57cec5SDimitry Andric return;
94240b57cec5SDimitry Andric }
94250b57cec5SDimitry Andric
9426fcaf7f86SDimitry Andric if (IsUniform) {
9427bdd1243dSDimitry Andric // If the recipe is uniform across all parts (instead of just per VF), only
9428bdd1243dSDimitry Andric // generate a single instance.
9429bdd1243dSDimitry Andric if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9430bdd1243dSDimitry Andric all_of(operands(), [](VPValue *Op) {
9431bdd1243dSDimitry Andric return Op->isDefinedOutsideVectorRegions();
9432bdd1243dSDimitry Andric })) {
9433fe013be4SDimitry Andric State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9434bdd1243dSDimitry Andric if (user_begin() != user_end()) {
9435bdd1243dSDimitry Andric for (unsigned Part = 1; Part < State.UF; ++Part)
9436bdd1243dSDimitry Andric State.set(this, State.get(this, VPIteration(0, 0)),
9437bdd1243dSDimitry Andric VPIteration(Part, 0));
9438bdd1243dSDimitry Andric }
9439bdd1243dSDimitry Andric return;
9440bdd1243dSDimitry Andric }
9441bdd1243dSDimitry Andric
9442fcaf7f86SDimitry Andric // Uniform within VL means we need to generate lane 0 only for each
9443fcaf7f86SDimitry Andric // unrolled copy.
9444fcaf7f86SDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part)
9445fe013be4SDimitry Andric State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9446bdd1243dSDimitry Andric return;
9447bdd1243dSDimitry Andric }
9448bdd1243dSDimitry Andric
9449fe013be4SDimitry Andric // A store of a loop varying value to a uniform address only needs the last
9450fe013be4SDimitry Andric // copy of the store.
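// For example (illustrative source code): in
//   for (i = 0; i < n; ++i) *p = a[i];
// only the value stored by the final iteration is observable, so emitting the
// store for the last lane of the last unrolled part is sufficient.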
9451fe013be4SDimitry Andric if (isa<StoreInst>(UI) &&
9452fe013be4SDimitry Andric vputils::isUniformAfterVectorization(getOperand(1))) {
9453bdd1243dSDimitry Andric auto Lane = VPLane::getLastLaneForVF(State.VF);
9454fe013be4SDimitry Andric State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9455fcaf7f86SDimitry Andric State);
9456fcaf7f86SDimitry Andric return;
9457fcaf7f86SDimitry Andric }
9458fcaf7f86SDimitry Andric
9459fcaf7f86SDimitry Andric // Generate scalar instances for all VF lanes of all UF parts.
9460fcaf7f86SDimitry Andric assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9461fcaf7f86SDimitry Andric const unsigned EndLane = State.VF.getKnownMinValue();
94620b57cec5SDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part)
94630b57cec5SDimitry Andric for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9464fe013be4SDimitry Andric State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
94650b57cec5SDimitry Andric }
94660b57cec5SDimitry Andric
94670b57cec5SDimitry Andric void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9468e8d8bef9SDimitry Andric VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
94694824e7fdSDimitry Andric
94704824e7fdSDimitry Andric // Attempt to issue a wide load.
94714824e7fdSDimitry Andric LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
94724824e7fdSDimitry Andric StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
94734824e7fdSDimitry Andric
94744824e7fdSDimitry Andric assert((LI || SI) && "Invalid Load/Store instruction");
94754824e7fdSDimitry Andric assert((!SI || StoredValue) && "No stored value provided for widened store");
94764824e7fdSDimitry Andric assert((!LI || !StoredValue) && "Stored value provided for widened load");
94774824e7fdSDimitry Andric
94784824e7fdSDimitry Andric Type *ScalarDataTy = getLoadStoreType(&Ingredient);
94794824e7fdSDimitry Andric
94804824e7fdSDimitry Andric auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
94814824e7fdSDimitry Andric const Align Alignment = getLoadStoreAlignment(&Ingredient);
9482fe013be4SDimitry Andric bool CreateGatherScatter = !isConsecutive();
94834824e7fdSDimitry Andric
94844824e7fdSDimitry Andric auto &Builder = State.Builder;
94854824e7fdSDimitry Andric InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
94864824e7fdSDimitry Andric bool isMaskRequired = getMask();
9487c9157d92SDimitry Andric if (isMaskRequired) {
9488de8261c4SDimitry Andric // Mask reversal is only needed when an explicit mask is present; a null mask
9489de8261c4SDimitry Andric // represents all-ones, and the reverse of an all-ones mask is all-ones again.
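// Small illustration (VF=4 assumed): a mask <1, 1, 1, 0> for a reverse access
// is reversed to <0, 1, 1, 1> so that it lines up with the reversed data and
// addresses.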
9490c9157d92SDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part) {
9491c9157d92SDimitry Andric Value *Mask = State.get(getMask(), Part);
9492c9157d92SDimitry Andric if (isReverse())
9493c9157d92SDimitry Andric Mask = Builder.CreateVectorReverse(Mask, "reverse");
9494c9157d92SDimitry Andric BlockInMaskParts[Part] = Mask;
9495c9157d92SDimitry Andric }
9496c9157d92SDimitry Andric }
94974824e7fdSDimitry Andric
94984824e7fdSDimitry Andric // Handle Stores:
94994824e7fdSDimitry Andric if (SI) {
9500c9157d92SDimitry Andric State.setDebugLocFrom(SI->getDebugLoc());
95014824e7fdSDimitry Andric
95024824e7fdSDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part) {
95034824e7fdSDimitry Andric Instruction *NewSI = nullptr;
95044824e7fdSDimitry Andric Value *StoredVal = State.get(StoredValue, Part);
95054824e7fdSDimitry Andric if (CreateGatherScatter) {
95064824e7fdSDimitry Andric Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
95074824e7fdSDimitry Andric Value *VectorGep = State.get(getAddr(), Part);
95084824e7fdSDimitry Andric NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
95094824e7fdSDimitry Andric MaskPart);
95104824e7fdSDimitry Andric } else {
9511fe013be4SDimitry Andric if (isReverse()) {
95124824e7fdSDimitry Andric // If we store to reverse consecutive memory locations, then we need
95134824e7fdSDimitry Andric // to reverse the order of elements in the stored value.
95144824e7fdSDimitry Andric StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
95154824e7fdSDimitry Andric // We don't want to update the value in the map as it might be used in
95164824e7fdSDimitry Andric // another expression. So don't call resetVectorValue(StoredVal).
95174824e7fdSDimitry Andric }
9518de8261c4SDimitry Andric auto *VecPtr = State.get(getAddr(), Part);
95194824e7fdSDimitry Andric if (isMaskRequired)
95204824e7fdSDimitry Andric NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
95214824e7fdSDimitry Andric BlockInMaskParts[Part]);
95224824e7fdSDimitry Andric else
95234824e7fdSDimitry Andric NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
95244824e7fdSDimitry Andric }
952581ad6265SDimitry Andric State.addMetadata(NewSI, SI);
95264824e7fdSDimitry Andric }
95274824e7fdSDimitry Andric return;
95284824e7fdSDimitry Andric }
95294824e7fdSDimitry Andric
95304824e7fdSDimitry Andric // Handle loads.
95314824e7fdSDimitry Andric assert(LI && "Must have a load instruction");
9532c9157d92SDimitry Andric State.setDebugLocFrom(LI->getDebugLoc());
95334824e7fdSDimitry Andric for (unsigned Part = 0; Part < State.UF; ++Part) {
95344824e7fdSDimitry Andric Value *NewLI;
95354824e7fdSDimitry Andric if (CreateGatherScatter) {
95364824e7fdSDimitry Andric Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
95374824e7fdSDimitry Andric Value *VectorGep = State.get(getAddr(), Part);
95384824e7fdSDimitry Andric NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
95394824e7fdSDimitry Andric nullptr, "wide.masked.gather");
954081ad6265SDimitry Andric State.addMetadata(NewLI, LI);
95414824e7fdSDimitry Andric } else {
9542de8261c4SDimitry Andric auto *VecPtr = State.get(getAddr(), Part);
95434824e7fdSDimitry Andric if (isMaskRequired)
95444824e7fdSDimitry Andric NewLI = Builder.CreateMaskedLoad(
95454824e7fdSDimitry Andric DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
95464824e7fdSDimitry Andric PoisonValue::get(DataTy), "wide.masked.load");
95474824e7fdSDimitry Andric else
95484824e7fdSDimitry Andric NewLI =
95494824e7fdSDimitry Andric Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
95504824e7fdSDimitry Andric
95514824e7fdSDimitry Andric // Add metadata to the load, but setVectorValue to the reverse shuffle.
955281ad6265SDimitry Andric State.addMetadata(NewLI, LI);
95534824e7fdSDimitry Andric if (Reverse)
95544824e7fdSDimitry Andric NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
95554824e7fdSDimitry Andric }
95564824e7fdSDimitry Andric
955781ad6265SDimitry Andric State.set(getVPSingleValue(), NewLI, Part);
95584824e7fdSDimitry Andric }
95590b57cec5SDimitry Andric }
95600b57cec5SDimitry Andric
9561480093f4SDimitry Andric // Determine how to lower the scalar epilogue, which depends on 1) optimising
9562480093f4SDimitry Andric // for minimum code-size, 2) compiler options that force predication, 3) loop
9563480093f4SDimitry Andric // hints forcing predication, and 4) a TTI hook that analyses whether the
9564480093f4SDimitry Andric // loop is suitable for predication.
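// Rough usage sketch (the option and pragma spellings here are assumptions,
// not verified against the definitions above): case 2) is driven by the
// cl::opt backing PreferPredicateOverEpilogue, e.g.
//   -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize
// while case 3) is fed by loop hints such as
//   #pragma clang loop vectorize_predicate(enable)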
9565480093f4SDimitry Andric static ScalarEpilogueLowering getScalarEpilogueLowering(
9566480093f4SDimitry Andric Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9567480093f4SDimitry Andric BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9568bdd1243dSDimitry Andric LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9569480093f4SDimitry Andric // 1) OptSize takes precedence over all other options, i.e. if this is set,
9570480093f4SDimitry Andric // don't look at hints or options, and don't request a scalar epilogue.
9571e8d8bef9SDimitry Andric // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9572e8d8bef9SDimitry Andric // LoopAccessInfo (due to code dependency and not being able to reliably get
9573e8d8bef9SDimitry Andric // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9574e8d8bef9SDimitry Andric // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9575e8d8bef9SDimitry Andric // versioning when the vectorization is forced, unlike hasOptSize. So revert
9576e8d8bef9SDimitry Andric // back to the old way and vectorize with versioning when forced. See D81345.)
9577e8d8bef9SDimitry Andric if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9578e8d8bef9SDimitry Andric PGSOQueryType::IRPass) &&
9579e8d8bef9SDimitry Andric Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9580480093f4SDimitry Andric return CM_ScalarEpilogueNotAllowedOptSize;
95818bcb0991SDimitry Andric
9582e8d8bef9SDimitry Andric // 2) If set, obey the directives
9583e8d8bef9SDimitry Andric if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9584e8d8bef9SDimitry Andric switch (PreferPredicateOverEpilogue) {
9585e8d8bef9SDimitry Andric case PreferPredicateTy::ScalarEpilogue:
9586480093f4SDimitry Andric return CM_ScalarEpilogueAllowed;
9587e8d8bef9SDimitry Andric case PreferPredicateTy::PredicateElseScalarEpilogue:
9588e8d8bef9SDimitry Andric return CM_ScalarEpilogueNotNeededUsePredicate;
9589e8d8bef9SDimitry Andric case PreferPredicateTy::PredicateOrDontVectorize:
9590e8d8bef9SDimitry Andric return CM_ScalarEpilogueNotAllowedUsePredicate;
9591e8d8bef9SDimitry Andric };
9592e8d8bef9SDimitry Andric }
9593480093f4SDimitry Andric
9594e8d8bef9SDimitry Andric // 3) If set, obey the hints
9595e8d8bef9SDimitry Andric switch (Hints.getPredicate()) {
9596e8d8bef9SDimitry Andric case LoopVectorizeHints::FK_Enabled:
9597e8d8bef9SDimitry Andric return CM_ScalarEpilogueNotNeededUsePredicate;
9598e8d8bef9SDimitry Andric case LoopVectorizeHints::FK_Disabled:
9599e8d8bef9SDimitry Andric return CM_ScalarEpilogueAllowed;
9600e8d8bef9SDimitry Andric };
9601e8d8bef9SDimitry Andric
9602e8d8bef9SDimitry Andric // 4) If the TTI hook indicates this is profitable, request predication.
9603fe013be4SDimitry Andric TailFoldingInfo TFI(TLI, &LVL, IAI);
9604fe013be4SDimitry Andric if (TTI->preferPredicateOverEpilogue(&TFI))
9605480093f4SDimitry Andric return CM_ScalarEpilogueNotNeededUsePredicate;
9606480093f4SDimitry Andric
9607480093f4SDimitry Andric return CM_ScalarEpilogueAllowed;
96088bcb0991SDimitry Andric }
96098bcb0991SDimitry Andric
96100b57cec5SDimitry Andric // Process the loop in the VPlan-native vectorization path. This path builds
96110b57cec5SDimitry Andric // VPlan upfront in the vectorization pipeline, which allows to apply
96120b57cec5SDimitry Andric // VPlan-to-VPlan transformations from the very beginning without modifying the
96130b57cec5SDimitry Andric // input LLVM IR.
96140b57cec5SDimitry Andric static bool processLoopInVPlanNativePath(
96150b57cec5SDimitry Andric Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
96160b57cec5SDimitry Andric LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
96170b57cec5SDimitry Andric TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
96180b57cec5SDimitry Andric OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9619fe6060f1SDimitry Andric ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9620fe6060f1SDimitry Andric LoopVectorizationRequirements &Requirements) {
96210b57cec5SDimitry Andric
9622e8d8bef9SDimitry Andric if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
96235ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
96245ffd83dbSDimitry Andric return false;
96255ffd83dbSDimitry Andric }
96260b57cec5SDimitry Andric assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
96270b57cec5SDimitry Andric Function *F = L->getHeader()->getParent();
96280b57cec5SDimitry Andric InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9629480093f4SDimitry Andric
9630fe013be4SDimitry Andric ScalarEpilogueLowering SEL =
9631fe013be4SDimitry Andric getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
96328bcb0991SDimitry Andric
96338bcb0991SDimitry Andric LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
96340b57cec5SDimitry Andric &Hints, IAI);
96350b57cec5SDimitry Andric // Use the planner for outer loop vectorization.
96360b57cec5SDimitry Andric // TODO: CM is not used at this point inside the planner. Turn CM into an
96370b57cec5SDimitry Andric // optional argument if we don't need it in the future.
9638c9157d92SDimitry Andric LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9639c9157d92SDimitry Andric ORE);
96400b57cec5SDimitry Andric
96410b57cec5SDimitry Andric // Get user vectorization factor.
9642e8d8bef9SDimitry Andric ElementCount UserVF = Hints.getWidth();
96430b57cec5SDimitry Andric
9644fe6060f1SDimitry Andric CM.collectElementTypesForWidening();
9645fe6060f1SDimitry Andric
96460b57cec5SDimitry Andric // Plan how to best vectorize, return the best VF and its cost.
96478bcb0991SDimitry Andric const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
96480b57cec5SDimitry Andric
96490b57cec5SDimitry Andric // If we are stress testing VPlan builds, do not attempt to generate vector
96500b57cec5SDimitry Andric // code. Masked vector code generation support will follow soon.
96510b57cec5SDimitry Andric // Also, do not attempt to vectorize if no vector code will be produced.
965281ad6265SDimitry Andric if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
96530b57cec5SDimitry Andric return false;
96540b57cec5SDimitry Andric
9655349cc55cSDimitry Andric VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
96560b57cec5SDimitry Andric
9657fe6060f1SDimitry Andric {
9658c9157d92SDimitry Andric bool AddBranchWeights =
9659c9157d92SDimitry Andric hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9660753f127fSDimitry Andric GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9661c9157d92SDimitry Andric F->getParent()->getDataLayout(), AddBranchWeights);
9662753f127fSDimitry Andric InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9663753f127fSDimitry Andric VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
96640b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
96650b57cec5SDimitry Andric << L->getHeader()->getParent()->getName() << "\"\n");
966681ad6265SDimitry Andric LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9667fe6060f1SDimitry Andric }
96680b57cec5SDimitry Andric
9669c9157d92SDimitry Andric reportVectorization(ORE, L, VF, 1);
9670c9157d92SDimitry Andric
96710b57cec5SDimitry Andric // Mark the loop as already vectorized to avoid vectorizing again.
96720b57cec5SDimitry Andric Hints.setAlreadyVectorized();
96735ffd83dbSDimitry Andric assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
96740b57cec5SDimitry Andric return true;
96750b57cec5SDimitry Andric }
96760b57cec5SDimitry Andric
9677fe6060f1SDimitry Andric // Emit a remark if there are stores to floats that required a floating point
9678fe6060f1SDimitry Andric // extension. If the vectorized loop was generated with mixed floating point
9679fe6060f1SDimitry Andric // precision, there will be a performance penalty from the conversion overhead
9680fe6060f1SDimitry Andric // and the change in the vector width.
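// A hypothetical source-level example of what this catches:
//   float *A; double B;
//   A[i] = A[i] * B; // A[i] is extended to double, multiplied, truncated back.
// The fpext feeding the chain into the float store is what the remark below
// points at.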
9681fe6060f1SDimitry Andric static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9682fe6060f1SDimitry Andric SmallVector<Instruction *, 4> Worklist;
9683fe6060f1SDimitry Andric for (BasicBlock *BB : L->getBlocks()) {
9684fe6060f1SDimitry Andric for (Instruction &Inst : *BB) {
9685fe6060f1SDimitry Andric if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9686fe6060f1SDimitry Andric if (S->getValueOperand()->getType()->isFloatTy())
9687fe6060f1SDimitry Andric Worklist.push_back(S);
9688fe6060f1SDimitry Andric }
9689fe6060f1SDimitry Andric }
9690fe6060f1SDimitry Andric }
9691fe6060f1SDimitry Andric
9692fe6060f1SDimitry Andric // Traverse the floating point stores upwards, searching for floating point
9693fe6060f1SDimitry Andric // conversions.
9694fe6060f1SDimitry Andric SmallPtrSet<const Instruction *, 4> Visited;
9695fe6060f1SDimitry Andric SmallPtrSet<const Instruction *, 4> EmittedRemark;
9696fe6060f1SDimitry Andric while (!Worklist.empty()) {
9697fe6060f1SDimitry Andric auto *I = Worklist.pop_back_val();
9698fe6060f1SDimitry Andric if (!L->contains(I))
9699fe6060f1SDimitry Andric continue;
9700fe6060f1SDimitry Andric if (!Visited.insert(I).second)
9701fe6060f1SDimitry Andric continue;
9702fe6060f1SDimitry Andric
9703fe6060f1SDimitry Andric // Emit a remark if the floating point store required a floating
9704fe6060f1SDimitry Andric // point conversion.
9705fe6060f1SDimitry Andric // TODO: More work could be done to identify the root cause such as a
9706fe6060f1SDimitry Andric // constant or a function return type and point the user to it.
9707fe6060f1SDimitry Andric if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9708fe6060f1SDimitry Andric ORE->emit([&]() {
9709fe6060f1SDimitry Andric return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9710fe6060f1SDimitry Andric I->getDebugLoc(), L->getHeader())
9711fe6060f1SDimitry Andric << "floating point conversion changes vector width. "
9712fe6060f1SDimitry Andric << "Mixed floating point precision requires an up/down "
9713fe6060f1SDimitry Andric << "cast that will negatively impact performance.";
9714fe6060f1SDimitry Andric });
9715fe6060f1SDimitry Andric
9716fe6060f1SDimitry Andric for (Use &Op : I->operands())
9717fe6060f1SDimitry Andric if (auto *OpI = dyn_cast<Instruction>(Op))
9718fe6060f1SDimitry Andric Worklist.push_back(OpI);
9719fe6060f1SDimitry Andric }
9720fe6060f1SDimitry Andric }
9721fe6060f1SDimitry Andric
9722753f127fSDimitry Andric static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9723753f127fSDimitry Andric VectorizationFactor &VF,
9724bdd1243dSDimitry Andric std::optional<unsigned> VScale, Loop *L,
9725c9157d92SDimitry Andric ScalarEvolution &SE,
9726c9157d92SDimitry Andric ScalarEpilogueLowering SEL) {
9727753f127fSDimitry Andric InstructionCost CheckCost = Checks.getCost();
9728753f127fSDimitry Andric if (!CheckCost.isValid())
9729753f127fSDimitry Andric return false;
9730753f127fSDimitry Andric
9731753f127fSDimitry Andric // When interleaving only scalar and vector cost will be equal, which in turn
9732753f127fSDimitry Andric // would lead to a divide by 0. Fall back to hard threshold.
9733753f127fSDimitry Andric if (VF.Width.isScalar()) {
9734753f127fSDimitry Andric if (CheckCost > VectorizeMemoryCheckThreshold) {
9735753f127fSDimitry Andric LLVM_DEBUG(
9736753f127fSDimitry Andric dbgs()
9737753f127fSDimitry Andric << "LV: Interleaving only is not profitable due to runtime checks\n");
9738753f127fSDimitry Andric return false;
9739753f127fSDimitry Andric }
9740753f127fSDimitry Andric return true;
9741753f127fSDimitry Andric }
9742753f127fSDimitry Andric
9743753f127fSDimitry Andric // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9744753f127fSDimitry Andric double ScalarC = *VF.ScalarCost.getValue();
9745753f127fSDimitry Andric if (ScalarC == 0)
9746753f127fSDimitry Andric return true;
9747753f127fSDimitry Andric
9748753f127fSDimitry Andric // First, compute the minimum iteration count required so that the vector
9749753f127fSDimitry Andric // loop outperforms the scalar loop.
9750753f127fSDimitry Andric // The total cost of the scalar loop is
9751753f127fSDimitry Andric // ScalarC * TC
9752753f127fSDimitry Andric // where
9753753f127fSDimitry Andric // * TC is the actual trip count of the loop.
9754753f127fSDimitry Andric // * ScalarC is the cost of a single scalar iteration.
9755753f127fSDimitry Andric //
9756753f127fSDimitry Andric // The total cost of the vector loop is
9757753f127fSDimitry Andric // RtC + VecC * (TC / VF) + EpiC
9758753f127fSDimitry Andric // where
9759753f127fSDimitry Andric // * RtC is the cost of the generated runtime checks
9760753f127fSDimitry Andric // * VecC is the cost of a single vector iteration.
9761753f127fSDimitry Andric // * TC is the actual trip count of the loop
9762753f127fSDimitry Andric // * VF is the vectorization factor
9763753f127fSDimitry Andric // * EpiC is the cost of the generated epilogue, including the cost
9764753f127fSDimitry Andric // of the remaining scalar operations.
9765753f127fSDimitry Andric //
9766753f127fSDimitry Andric // Vectorization is profitable once the total vector cost is less than the
9767753f127fSDimitry Andric // total scalar cost:
9768753f127fSDimitry Andric // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9769753f127fSDimitry Andric //
9770753f127fSDimitry Andric // Now we can compute the minimum required trip count TC as
9771753f127fSDimitry Andric // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9772753f127fSDimitry Andric //
9773753f127fSDimitry Andric // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9774753f127fSDimitry Andric // the computations are performed on doubles, not integers and the result
9775753f127fSDimitry Andric // is rounded up, hence we get an upper estimate of the TC.
9776753f127fSDimitry Andric unsigned IntVF = VF.Width.getKnownMinValue();
9777753f127fSDimitry Andric if (VF.Width.isScalable()) {
9778753f127fSDimitry Andric unsigned AssumedMinimumVscale = 1;
9779753f127fSDimitry Andric if (VScale)
9780753f127fSDimitry Andric AssumedMinimumVscale = *VScale;
9781753f127fSDimitry Andric IntVF *= AssumedMinimumVscale;
9782753f127fSDimitry Andric }
9783753f127fSDimitry Andric double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9784753f127fSDimitry Andric double RtC = *CheckCost.getValue();
9785753f127fSDimitry Andric double MinTC1 = RtC / (ScalarC - VecCOverVF);
9786753f127fSDimitry Andric
9787753f127fSDimitry Andric // Second, compute a minimum iteration count so that the cost of the
9788753f127fSDimitry Andric // runtime checks is only a fraction of the total scalar loop cost. This
9789753f127fSDimitry Andric // adds a loop-dependent bound on the overhead incurred if the runtime
9790753f127fSDimitry Andric // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9791753f127fSDimitry Andric // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9792753f127fSDimitry Andric // cost, compute
9793753f127fSDimitry Andric // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9794753f127fSDimitry Andric double MinTC2 = RtC * 10 / ScalarC;
9795753f127fSDimitry Andric
9796c9157d92SDimitry Andric // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9797c9157d92SDimitry Andric // epilogue is allowed, choose the next closest multiple of VF. This should
9798c9157d92SDimitry Andric // partly compensate for ignoring the epilogue cost.
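// Worked example with assumed costs (not taken from any real target):
// ScalarC=4, VecC=8, VF=4 and RtC=20 give
//   MinTC1 = 20 / (4 - 8/4) = 10   and   MinTC2 = 20 * 10 / 4 = 50,
// so the minimum profitable trip count starts from max(10, 50) = 50 and, if a
// scalar epilogue is allowed, is rounded up to the next multiple of VF (52).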
9799753f127fSDimitry Andric uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9800c9157d92SDimitry Andric if (SEL == CM_ScalarEpilogueAllowed)
9801c9157d92SDimitry Andric MinTC = alignTo(MinTC, IntVF);
9802c9157d92SDimitry Andric VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9803753f127fSDimitry Andric
9804753f127fSDimitry Andric LLVM_DEBUG(
9805753f127fSDimitry Andric dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9806753f127fSDimitry Andric << VF.MinProfitableTripCount << "\n");
9807753f127fSDimitry Andric
9808753f127fSDimitry Andric // Skip vectorization if the expected trip count is less than the minimum
9809753f127fSDimitry Andric // required trip count.
9810753f127fSDimitry Andric if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9811753f127fSDimitry Andric if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9812753f127fSDimitry Andric VF.MinProfitableTripCount)) {
9813753f127fSDimitry Andric LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9814753f127fSDimitry Andric "trip count < minimum profitable VF ("
9815753f127fSDimitry Andric << *ExpectedTC << " < " << VF.MinProfitableTripCount
9816753f127fSDimitry Andric << ")\n");
9817753f127fSDimitry Andric
9818753f127fSDimitry Andric return false;
9819753f127fSDimitry Andric }
9820753f127fSDimitry Andric }
9821753f127fSDimitry Andric return true;
9822753f127fSDimitry Andric }
9823753f127fSDimitry Andric
98245ffd83dbSDimitry Andric LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
98255ffd83dbSDimitry Andric : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
98265ffd83dbSDimitry Andric !EnableLoopInterleaving),
98275ffd83dbSDimitry Andric VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
98285ffd83dbSDimitry Andric !EnableLoopVectorization) {}
98295ffd83dbSDimitry Andric
98300b57cec5SDimitry Andric bool LoopVectorizePass::processLoop(Loop *L) {
9831e8d8bef9SDimitry Andric assert((EnableVPlanNativePath || L->isInnermost()) &&
98320b57cec5SDimitry Andric "VPlan-native path is not enabled. Only process inner loops.");
98330b57cec5SDimitry Andric
98340b57cec5SDimitry Andric #ifndef NDEBUG
98350b57cec5SDimitry Andric const std::string DebugLocStr = getDebugLocString(L);
98360b57cec5SDimitry Andric #endif /* NDEBUG */
98370b57cec5SDimitry Andric
983881ad6265SDimitry Andric LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
983981ad6265SDimitry Andric << L->getHeader()->getParent()->getName() << "' from "
98400b57cec5SDimitry Andric << DebugLocStr << "\n");
98410b57cec5SDimitry Andric
98420eae32dcSDimitry Andric LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
98430b57cec5SDimitry Andric
98440b57cec5SDimitry Andric LLVM_DEBUG(
98450b57cec5SDimitry Andric dbgs() << "LV: Loop hints:"
98460b57cec5SDimitry Andric << " force="
98470b57cec5SDimitry Andric << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
98480b57cec5SDimitry Andric ? "disabled"
98490b57cec5SDimitry Andric : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
98500b57cec5SDimitry Andric ? "enabled"
98510b57cec5SDimitry Andric : "?"))
98520b57cec5SDimitry Andric << " width=" << Hints.getWidth()
9853fe6060f1SDimitry Andric << " interleave=" << Hints.getInterleave() << "\n");
98540b57cec5SDimitry Andric
98550b57cec5SDimitry Andric // Function containing loop
98560b57cec5SDimitry Andric Function *F = L->getHeader()->getParent();
98570b57cec5SDimitry Andric
98580b57cec5SDimitry Andric // Looking at the diagnostic output is the only way to determine if a loop
98590b57cec5SDimitry Andric // was vectorized (other than looking at the IR or machine code), so it
98600b57cec5SDimitry Andric // is important to generate an optimization remark for each loop. Most of
98610b57cec5SDimitry Andric // these messages are generated as OptimizationRemarkAnalysis. Remarks
98620b57cec5SDimitry Andric // generated as OptimizationRemark and OptimizationRemarkMissed are
98630b57cec5SDimitry Andric // less verbose reporting vectorized loops and unvectorized loops that may
98640b57cec5SDimitry Andric // benefit from vectorization, respectively.
98650b57cec5SDimitry Andric
98660b57cec5SDimitry Andric if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
98670b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
98680b57cec5SDimitry Andric return false;
98690b57cec5SDimitry Andric }
98700b57cec5SDimitry Andric
98710b57cec5SDimitry Andric PredicatedScalarEvolution PSE(*SE, *L);
98720b57cec5SDimitry Andric
98730b57cec5SDimitry Andric // Check if it is legal to vectorize the loop.
9874fe6060f1SDimitry Andric LoopVectorizationRequirements Requirements;
9875bdd1243dSDimitry Andric LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9876e8d8bef9SDimitry Andric &Requirements, &Hints, DB, AC, BFI, PSI);
98770b57cec5SDimitry Andric if (!LVL.canVectorize(EnableVPlanNativePath)) {
98780b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
98790b57cec5SDimitry Andric Hints.emitRemarkWithHints();
98800b57cec5SDimitry Andric return false;
98810b57cec5SDimitry Andric }
98820b57cec5SDimitry Andric
98830b57cec5SDimitry Andric // Entrance to the VPlan-native vectorization path. Outer loops are processed
98840b57cec5SDimitry Andric // here. They may require CFG and instruction level transformations before
98850b57cec5SDimitry Andric // even evaluating whether vectorization is profitable. Since we cannot modify
98860b57cec5SDimitry Andric // the incoming IR, we need to build VPlan upfront in the vectorization
98870b57cec5SDimitry Andric // pipeline.
9888e8d8bef9SDimitry Andric if (!L->isInnermost())
98890b57cec5SDimitry Andric return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9890fe6060f1SDimitry Andric ORE, BFI, PSI, Hints, Requirements);
98910b57cec5SDimitry Andric
9892e8d8bef9SDimitry Andric assert(L->isInnermost() && "Inner loop expected.");
98938bcb0991SDimitry Andric
9894bdd1243dSDimitry Andric InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9895bdd1243dSDimitry Andric bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9896bdd1243dSDimitry Andric
9897bdd1243dSDimitry Andric // If an override option has been passed in for interleaved accesses, use it.
9898bdd1243dSDimitry Andric if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9899bdd1243dSDimitry Andric UseInterleaved = EnableInterleavedMemAccesses;
9900bdd1243dSDimitry Andric
9901bdd1243dSDimitry Andric // Analyze interleaved memory accesses.
9902bdd1243dSDimitry Andric if (UseInterleaved)
9903bdd1243dSDimitry Andric IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9904bdd1243dSDimitry Andric
9905bdd1243dSDimitry Andric // Check the function attributes and profiles to find out if this function
9906bdd1243dSDimitry Andric // should be optimized for size.
9907fe013be4SDimitry Andric ScalarEpilogueLowering SEL =
9908fe013be4SDimitry Andric getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9909bdd1243dSDimitry Andric
99100b57cec5SDimitry Andric // Check the loop for a trip count threshold: vectorize loops with a tiny trip
99110b57cec5SDimitry Andric // count by optimizing for size, to minimize overheads.
99128bcb0991SDimitry Andric auto ExpectedTC = getSmallBestKnownTC(*SE, L);
99138bcb0991SDimitry Andric if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
99140b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
99150b57cec5SDimitry Andric << "This loop is worth vectorizing only if no scalar "
99160b57cec5SDimitry Andric << "iteration overheads are incurred.");
99170b57cec5SDimitry Andric if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
99180b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
99190b57cec5SDimitry Andric else {
992061cfbce3SDimitry Andric if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
99210b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "\n");
9922c9157d92SDimitry Andric // Predicate tail-folded loops are efficient even when the loop
9923c9157d92SDimitry Andric // iteration count is low. However, setting the epilogue policy to
9924c9157d92SDimitry Andric // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9925c9157d92SDimitry Andric // with runtime checks. It's more effective to let
9926c9157d92SDimitry Andric // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9927c9157d92SDimitry Andric // for the loop.
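// For instance (numbers assumed for illustration): with a trip count of 10
// and VF=4, a tail-folded loop executes 3 masked vector iterations, the last
// of which has only 2 active lanes, so no scalar epilogue is required.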
9928c9157d92SDimitry Andric if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
99298bcb0991SDimitry Andric SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
993061cfbce3SDimitry Andric } else {
993161cfbce3SDimitry Andric LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
993261cfbce3SDimitry Andric "small to be worth vectorizing.\n");
993361cfbce3SDimitry Andric reportVectorizationFailure(
993461cfbce3SDimitry Andric "The trip count is below the minimal threshold value.",
993561cfbce3SDimitry Andric "loop trip count is too low, avoiding vectorization",
993661cfbce3SDimitry Andric "LowTripCount", ORE, L);
993761cfbce3SDimitry Andric Hints.emitRemarkWithHints();
993861cfbce3SDimitry Andric return false;
993961cfbce3SDimitry Andric }
99400b57cec5SDimitry Andric }
99410b57cec5SDimitry Andric }
99420b57cec5SDimitry Andric
9943bdd1243dSDimitry Andric // Check the function attributes to see if implicit floats or vectors are
9944bdd1243dSDimitry Andric // allowed.
99450b57cec5SDimitry Andric if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
99468bcb0991SDimitry Andric reportVectorizationFailure(
99478bcb0991SDimitry Andric "Can't vectorize when the NoImplicitFloat attribute is used",
99488bcb0991SDimitry Andric "loop not vectorized due to NoImplicitFloat attribute",
99498bcb0991SDimitry Andric "NoImplicitFloat", ORE, L);
99500b57cec5SDimitry Andric Hints.emitRemarkWithHints();
99510b57cec5SDimitry Andric return false;
99520b57cec5SDimitry Andric }
99530b57cec5SDimitry Andric
99540b57cec5SDimitry Andric // Check if the target supports potentially unsafe FP vectorization.
99550b57cec5SDimitry Andric // FIXME: Add a check for the type of safety issue (denormal, signaling)
99560b57cec5SDimitry Andric // for the target we're vectorizing for, to make sure none of the
99570b57cec5SDimitry Andric // additional fp-math flags can help.
99580b57cec5SDimitry Andric if (Hints.isPotentiallyUnsafe() &&
99590b57cec5SDimitry Andric TTI->isFPVectorizationPotentiallyUnsafe()) {
99608bcb0991SDimitry Andric reportVectorizationFailure(
99618bcb0991SDimitry Andric "Potentially unsafe FP op prevents vectorization",
99628bcb0991SDimitry Andric "loop not vectorized due to unsafe FP support.",
99638bcb0991SDimitry Andric "UnsafeFP", ORE, L);
99640b57cec5SDimitry Andric Hints.emitRemarkWithHints();
99650b57cec5SDimitry Andric return false;
99660b57cec5SDimitry Andric }
99670b57cec5SDimitry Andric
9968349cc55cSDimitry Andric bool AllowOrderedReductions;
9969349cc55cSDimitry Andric // If the flag is set, use that instead and override the TTI behaviour.
9970349cc55cSDimitry Andric if (ForceOrderedReductions.getNumOccurrences() > 0)
9971349cc55cSDimitry Andric AllowOrderedReductions = ForceOrderedReductions;
9972349cc55cSDimitry Andric else
9973349cc55cSDimitry Andric AllowOrderedReductions = TTI->enableOrderedReductions();
9974349cc55cSDimitry Andric if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9975fe6060f1SDimitry Andric ORE->emit([&]() {
9976fe6060f1SDimitry Andric auto *ExactFPMathInst = Requirements.getExactFPInst();
9977fe6060f1SDimitry Andric return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9978fe6060f1SDimitry Andric ExactFPMathInst->getDebugLoc(),
9979fe6060f1SDimitry Andric ExactFPMathInst->getParent())
9980fe6060f1SDimitry Andric << "loop not vectorized: cannot prove it is safe to reorder "
9981fe6060f1SDimitry Andric "floating-point operations";
9982fe6060f1SDimitry Andric });
9983fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9984fe6060f1SDimitry Andric "reorder floating-point operations\n");
9985fe6060f1SDimitry Andric Hints.emitRemarkWithHints();
9986fe6060f1SDimitry Andric return false;
9987fe6060f1SDimitry Andric }
9988fe6060f1SDimitry Andric
99890b57cec5SDimitry Andric // Use the cost model.
99908bcb0991SDimitry Andric LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
99918bcb0991SDimitry Andric F, &Hints, IAI);
99920b57cec5SDimitry Andric // Use the planner for vectorization.
9993c9157d92SDimitry Andric LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9994fe013be4SDimitry Andric ORE);
99950b57cec5SDimitry Andric
99965ffd83dbSDimitry Andric // Get user vectorization factor and interleave count.
9997e8d8bef9SDimitry Andric ElementCount UserVF = Hints.getWidth();
99985ffd83dbSDimitry Andric unsigned UserIC = Hints.getInterleave();
99990b57cec5SDimitry Andric
100000b57cec5SDimitry Andric // Plan how to best vectorize, return the best VF and its cost.
10001bdd1243dSDimitry Andric std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
100020b57cec5SDimitry Andric
100030b57cec5SDimitry Andric VectorizationFactor VF = VectorizationFactor::Disabled();
100040b57cec5SDimitry Andric unsigned IC = 1;
100050b57cec5SDimitry Andric
10006c9157d92SDimitry Andric bool AddBranchWeights =
10007c9157d92SDimitry Andric hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10008753f127fSDimitry Andric GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10009c9157d92SDimitry Andric F->getParent()->getDataLayout(), AddBranchWeights);
100100b57cec5SDimitry Andric if (MaybeVF) {
10011753f127fSDimitry Andric VF = *MaybeVF;
10012753f127fSDimitry Andric // Select the interleave count.
10013bdd1243dSDimitry Andric IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10014753f127fSDimitry Andric
10015753f127fSDimitry Andric unsigned SelectedIC = std::max(IC, UserIC);
10016753f127fSDimitry Andric // Optimistically generate runtime checks if they are needed. Drop them if
10017753f127fSDimitry Andric // they turn out to not be profitable.
10018753f127fSDimitry Andric if (VF.Width.isVector() || SelectedIC > 1)
10019753f127fSDimitry Andric Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10020753f127fSDimitry Andric
10021753f127fSDimitry Andric // Check if it is profitable to vectorize with runtime checks.
10022753f127fSDimitry Andric bool ForceVectorization =
10023753f127fSDimitry Andric Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10024753f127fSDimitry Andric if (!ForceVectorization &&
10025fe013be4SDimitry Andric !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
10026c9157d92SDimitry Andric *PSE.getSE(), SEL)) {
1002781ad6265SDimitry Andric ORE->emit([&]() {
1002881ad6265SDimitry Andric return OptimizationRemarkAnalysisAliasing(
1002981ad6265SDimitry Andric DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
1003081ad6265SDimitry Andric L->getHeader())
1003181ad6265SDimitry Andric << "loop not vectorized: cannot prove it is safe to reorder "
1003281ad6265SDimitry Andric "memory operations";
1003381ad6265SDimitry Andric });
1003481ad6265SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1003581ad6265SDimitry Andric Hints.emitRemarkWithHints();
1003681ad6265SDimitry Andric return false;
1003781ad6265SDimitry Andric }
100380b57cec5SDimitry Andric }
100390b57cec5SDimitry Andric
100400b57cec5SDimitry Andric // Identify the diagnostic messages that should be produced.
100410b57cec5SDimitry Andric std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
100420b57cec5SDimitry Andric bool VectorizeLoop = true, InterleaveLoop = true;
10043e8d8bef9SDimitry Andric if (VF.Width.isScalar()) {
100440b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
100450b57cec5SDimitry Andric VecDiagMsg = std::make_pair(
100460b57cec5SDimitry Andric "VectorizationNotBeneficial",
100470b57cec5SDimitry Andric "the cost-model indicates that vectorization is not beneficial");
100480b57cec5SDimitry Andric VectorizeLoop = false;
100490b57cec5SDimitry Andric }
100500b57cec5SDimitry Andric
100510b57cec5SDimitry Andric if (!MaybeVF && UserIC > 1) {
100520b57cec5SDimitry Andric // Tell the user interleaving was avoided up-front, despite being explicitly
100530b57cec5SDimitry Andric // requested.
100540b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
100550b57cec5SDimitry Andric "interleaving should be avoided up front\n");
100560b57cec5SDimitry Andric IntDiagMsg = std::make_pair(
100570b57cec5SDimitry Andric "InterleavingAvoided",
100580b57cec5SDimitry Andric "Ignoring UserIC, because interleaving was avoided up front");
100590b57cec5SDimitry Andric InterleaveLoop = false;
100600b57cec5SDimitry Andric } else if (IC == 1 && UserIC <= 1) {
100610b57cec5SDimitry Andric // Tell the user interleaving is not beneficial.
100620b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
100630b57cec5SDimitry Andric IntDiagMsg = std::make_pair(
100640b57cec5SDimitry Andric "InterleavingNotBeneficial",
100650b57cec5SDimitry Andric "the cost-model indicates that interleaving is not beneficial");
100660b57cec5SDimitry Andric InterleaveLoop = false;
100670b57cec5SDimitry Andric if (UserIC == 1) {
100680b57cec5SDimitry Andric IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
100690b57cec5SDimitry Andric IntDiagMsg.second +=
100700b57cec5SDimitry Andric " and is explicitly disabled or interleave count is set to 1";
100710b57cec5SDimitry Andric }
100720b57cec5SDimitry Andric } else if (IC > 1 && UserIC == 1) {
100730b57cec5SDimitry Andric // Tell the user interleaving is beneficial, but it is explicitly disabled.
100740b57cec5SDimitry Andric LLVM_DEBUG(
100750b57cec5SDimitry Andric dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
100760b57cec5SDimitry Andric IntDiagMsg = std::make_pair(
100770b57cec5SDimitry Andric "InterleavingBeneficialButDisabled",
100780b57cec5SDimitry Andric "the cost-model indicates that interleaving is beneficial "
100790b57cec5SDimitry Andric "but is explicitly disabled or interleave count is set to 1");
100800b57cec5SDimitry Andric InterleaveLoop = false;
100810b57cec5SDimitry Andric }
100820b57cec5SDimitry Andric
100830b57cec5SDimitry Andric // Override IC if user provided an interleave count.
100840b57cec5SDimitry Andric IC = UserIC > 0 ? UserIC : IC;
100850b57cec5SDimitry Andric
100860b57cec5SDimitry Andric // Emit diagnostic messages, if any.
100870b57cec5SDimitry Andric const char *VAPassName = Hints.vectorizeAnalysisPassName();
100880b57cec5SDimitry Andric if (!VectorizeLoop && !InterleaveLoop) {
100890b57cec5SDimitry Andric // Do not vectorize or interleave the loop.
100900b57cec5SDimitry Andric ORE->emit([&]() {
100910b57cec5SDimitry Andric return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
100920b57cec5SDimitry Andric L->getStartLoc(), L->getHeader())
100930b57cec5SDimitry Andric << VecDiagMsg.second;
100940b57cec5SDimitry Andric });
100950b57cec5SDimitry Andric ORE->emit([&]() {
100960b57cec5SDimitry Andric return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
100970b57cec5SDimitry Andric L->getStartLoc(), L->getHeader())
100980b57cec5SDimitry Andric << IntDiagMsg.second;
100990b57cec5SDimitry Andric });
101000b57cec5SDimitry Andric return false;
101010b57cec5SDimitry Andric } else if (!VectorizeLoop && InterleaveLoop) {
101020b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
101030b57cec5SDimitry Andric ORE->emit([&]() {
101040b57cec5SDimitry Andric return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
101050b57cec5SDimitry Andric L->getStartLoc(), L->getHeader())
101060b57cec5SDimitry Andric << VecDiagMsg.second;
101070b57cec5SDimitry Andric });
101080b57cec5SDimitry Andric } else if (VectorizeLoop && !InterleaveLoop) {
101090b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
101100b57cec5SDimitry Andric << ") in " << DebugLocStr << '\n');
101110b57cec5SDimitry Andric ORE->emit([&]() {
101120b57cec5SDimitry Andric return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
101130b57cec5SDimitry Andric L->getStartLoc(), L->getHeader())
101140b57cec5SDimitry Andric << IntDiagMsg.second;
101150b57cec5SDimitry Andric });
101160b57cec5SDimitry Andric } else if (VectorizeLoop && InterleaveLoop) {
101170b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
101180b57cec5SDimitry Andric << ") in " << DebugLocStr << '\n');
101190b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
101200b57cec5SDimitry Andric }
101210b57cec5SDimitry Andric
10122fe6060f1SDimitry Andric bool DisableRuntimeUnroll = false;
10123fe6060f1SDimitry Andric MDNode *OrigLoopID = L->getLoopID();
10124fe6060f1SDimitry Andric {
101250b57cec5SDimitry Andric using namespace ore;
101260b57cec5SDimitry Andric if (!VectorizeLoop) {
101270b57cec5SDimitry Andric assert(IC > 1 && "interleave count should not be 1 or 0");
101280b57cec5SDimitry Andric // If we decided that it is not profitable to vectorize the loop, then
101290b57cec5SDimitry Andric // interleave it.
10130fe6060f1SDimitry Andric InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10131fe6060f1SDimitry Andric &CM, BFI, PSI, Checks);
10132349cc55cSDimitry Andric
10133349cc55cSDimitry Andric VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
1013481ad6265SDimitry Andric LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
101350b57cec5SDimitry Andric
101360b57cec5SDimitry Andric ORE->emit([&]() {
101370b57cec5SDimitry Andric return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
101380b57cec5SDimitry Andric L->getHeader())
101390b57cec5SDimitry Andric << "interleaved loop (interleaved count: "
101400b57cec5SDimitry Andric << NV("InterleaveCount", IC) << ")";
101410b57cec5SDimitry Andric });
101420b57cec5SDimitry Andric } else {
101430b57cec5SDimitry Andric // If we decided that it is *profitable* to vectorize the loop, then do it.
10144e8d8bef9SDimitry Andric
10145e8d8bef9SDimitry Andric // Consider vectorizing the epilogue too if it's profitable.
10146e8d8bef9SDimitry Andric VectorizationFactor EpilogueVF =
10147fe013be4SDimitry Andric LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10148e8d8bef9SDimitry Andric if (EpilogueVF.Width.isVector()) {
10149e8d8bef9SDimitry Andric
10150e8d8bef9SDimitry Andric // The first pass vectorizes the main loop and creates a scalar epilogue
10151e8d8bef9SDimitry Andric // to be vectorized by executing the plan (potentially with a different
10152e8d8bef9SDimitry Andric // factor) again shortly afterwards.
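// Illustrative shape of the result (factors assumed): a main vector loop at
// VF=8 is followed by an epilogue vector loop at VF=4 and finally a scalar
// remainder loop; iteration-count checks select between them at runtime.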
10153349cc55cSDimitry Andric EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10154fe6060f1SDimitry Andric EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10155fe6060f1SDimitry Andric EPI, &LVL, &CM, BFI, PSI, Checks);
10156e8d8bef9SDimitry Andric
10157349cc55cSDimitry Andric VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10158cdc20ff6SDimitry Andric const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10159cdc20ff6SDimitry Andric EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10160e8d8bef9SDimitry Andric ++LoopsVectorized;
10161e8d8bef9SDimitry Andric
10162e8d8bef9SDimitry Andric // Second pass vectorizes the epilogue and adjusts the control flow
10163e8d8bef9SDimitry Andric // edges from the first pass.
10164e8d8bef9SDimitry Andric EPI.MainLoopVF = EPI.EpilogueVF;
10165e8d8bef9SDimitry Andric EPI.MainLoopUF = EPI.EpilogueUF;
10166e8d8bef9SDimitry Andric EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10167fe6060f1SDimitry Andric ORE, EPI, &LVL, &CM, BFI, PSI,
10168fe6060f1SDimitry Andric Checks);
10169349cc55cSDimitry Andric
10170349cc55cSDimitry Andric VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
1017181ad6265SDimitry Andric VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
1017281ad6265SDimitry Andric VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
1017381ad6265SDimitry Andric Header->setName("vec.epilog.vector.body");
1017404eeddc0SDimitry Andric
10175fe013be4SDimitry Andric // Re-use the trip count and steps expanded for the main loop, as
10176fe013be4SDimitry Andric // skeleton creation needs them as values that dominate both the scalar
10177fe013be4SDimitry Andric // and vector epilogue loops.
10178fe013be4SDimitry Andric // TODO: This is a workaround needed for epilogue vectorization and it
10179fe013be4SDimitry Andric // should be removed once induction resume value creation is done
10180fe013be4SDimitry Andric // directly in VPlan.
10181fe013be4SDimitry Andric EpilogILV.setTripCount(MainILV.getTripCount());
10182fe013be4SDimitry Andric for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10183fe013be4SDimitry Andric auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10184fe013be4SDimitry Andric auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10185fe013be4SDimitry Andric ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10186fe013be4SDimitry Andric ExpandR->replaceAllUsesWith(ExpandedVal);
10187fe013be4SDimitry Andric ExpandR->eraseFromParent();
10188fe013be4SDimitry Andric }
10189fe013be4SDimitry Andric
10190bdd1243dSDimitry Andric // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10191bdd1243dSDimitry Andric // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10192bdd1243dSDimitry Andric // before vectorizing the epilogue loop.
1019304eeddc0SDimitry Andric for (VPRecipeBase &R : Header->phis()) {
10194bdd1243dSDimitry Andric if (isa<VPCanonicalIVPHIRecipe>(&R))
10195bdd1243dSDimitry Andric continue;
10196bdd1243dSDimitry Andric
10197bdd1243dSDimitry Andric Value *ResumeV = nullptr;
10198bdd1243dSDimitry Andric // TODO: Move setting of resume values to prepareToExecute.
1019904eeddc0SDimitry Andric if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10200cdc20ff6SDimitry Andric ResumeV = ReductionResumeValues
10201cdc20ff6SDimitry Andric .find(&ReductionPhi->getRecurrenceDescriptor())
10202cdc20ff6SDimitry Andric ->second;
10203bdd1243dSDimitry Andric } else {
10204bdd1243dSDimitry Andric // Create induction resume values for both widened pointer and
10205bdd1243dSDimitry Andric // integer/fp inductions and update the start value of the induction
10206bdd1243dSDimitry Andric // recipes to use the resume value.
10207bdd1243dSDimitry Andric PHINode *IndPhi = nullptr;
10208bdd1243dSDimitry Andric const InductionDescriptor *ID;
10209bdd1243dSDimitry Andric if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10210bdd1243dSDimitry Andric IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10211bdd1243dSDimitry Andric ID = &Ind->getInductionDescriptor();
10212bdd1243dSDimitry Andric } else {
10213bdd1243dSDimitry Andric auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10214bdd1243dSDimitry Andric IndPhi = WidenInd->getPHINode();
10215bdd1243dSDimitry Andric ID = &WidenInd->getInductionDescriptor();
1021604eeddc0SDimitry Andric }
10217bdd1243dSDimitry Andric
10218bdd1243dSDimitry Andric ResumeV = MainILV.createInductionResumeValue(
10219fe013be4SDimitry Andric IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10220fe013be4SDimitry Andric {EPI.MainLoopIterationCountCheck});
1022104eeddc0SDimitry Andric }
10222bdd1243dSDimitry Andric assert(ResumeV && "Must have a resume value");
10223fe013be4SDimitry Andric VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10224bdd1243dSDimitry Andric cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
1022504eeddc0SDimitry Andric }
1022604eeddc0SDimitry Andric
10227349cc55cSDimitry Andric LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10228fe013be4SDimitry Andric DT, true, &ExpandedSCEVs);
10229e8d8bef9SDimitry Andric ++LoopsEpilogueVectorized;
10230e8d8bef9SDimitry Andric
10231e8d8bef9SDimitry Andric if (!MainILV.areSafetyChecksAdded())
10232e8d8bef9SDimitry Andric DisableRuntimeUnroll = true;
10233e8d8bef9SDimitry Andric } else {
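// No epilogue vectorization: vectorize the loop in a single pass with
// the chosen vectorization factor and interleave count.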
10234753f127fSDimitry Andric InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10235753f127fSDimitry Andric VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10236753f127fSDimitry Andric PSI, Checks);
10237349cc55cSDimitry Andric
10238349cc55cSDimitry Andric VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
1023981ad6265SDimitry Andric LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
102400b57cec5SDimitry Andric ++LoopsVectorized;
102410b57cec5SDimitry Andric
10242fe6060f1SDimitry Andric // Add metadata to disable runtime unrolling of the scalar loop when
10243fe6060f1SDimitry Andric // there are no runtime checks about strides and memory. A scalar loop
102440b57cec5SDimitry Andric // that is rarely used is not worth unrolling.
102450b57cec5SDimitry Andric if (!LB.areSafetyChecksAdded())
102460b57cec5SDimitry Andric DisableRuntimeUnroll = true;
10247e8d8bef9SDimitry Andric }
102480b57cec5SDimitry Andric // Report the vectorization decision.
10249c9157d92SDimitry Andric reportVectorization(ORE, L, VF, IC);
102500b57cec5SDimitry Andric }
102510b57cec5SDimitry Andric
10252fe6060f1SDimitry Andric if (ORE->allowExtraAnalysis(LV_NAME))
10253fe6060f1SDimitry Andric checkMixedPrecision(L, ORE);
10254fe6060f1SDimitry Andric }
10255fe6060f1SDimitry Andric
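// Propagate any user-specified follow-up metadata (see
// LLVMLoopVectorizeFollowupAll / LLVMLoopVectorizeFollowupEpilogue) from
// the original loop to the remaining scalar loop.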
10256bdd1243dSDimitry Andric std::optional<MDNode *> RemainderLoopID =
102570b57cec5SDimitry Andric makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
102580b57cec5SDimitry Andric LLVMLoopVectorizeFollowupEpilogue});
1025981ad6265SDimitry Andric if (RemainderLoopID) {
10260bdd1243dSDimitry Andric L->setLoopID(*RemainderLoopID);
102610b57cec5SDimitry Andric } else {
102620b57cec5SDimitry Andric if (DisableRuntimeUnroll)
102630b57cec5SDimitry Andric AddRuntimeUnrollDisableMetaData(L);
102640b57cec5SDimitry Andric
102650b57cec5SDimitry Andric // Mark the loop as already vectorized to avoid vectorizing again.
102660b57cec5SDimitry Andric Hints.setAlreadyVectorized();
102670b57cec5SDimitry Andric }
102680b57cec5SDimitry Andric
102695ffd83dbSDimitry Andric assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
102700b57cec5SDimitry Andric return true;
102710b57cec5SDimitry Andric }
102720b57cec5SDimitry Andric
102735ffd83dbSDimitry Andric LoopVectorizeResult LoopVectorizePass::runImpl(
102740b57cec5SDimitry Andric Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10275fe013be4SDimitry Andric DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10276bdd1243dSDimitry Andric DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
102770b57cec5SDimitry Andric OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
102780b57cec5SDimitry Andric SE = &SE_;
102790b57cec5SDimitry Andric LI = &LI_;
102800b57cec5SDimitry Andric TTI = &TTI_;
102810b57cec5SDimitry Andric DT = &DT_;
10282fe013be4SDimitry Andric BFI = BFI_;
102830b57cec5SDimitry Andric TLI = TLI_;
102840b57cec5SDimitry Andric AC = &AC_;
10285bdd1243dSDimitry Andric LAIs = &LAIs_;
102860b57cec5SDimitry Andric DB = &DB_;
102870b57cec5SDimitry Andric ORE = &ORE_;
102880b57cec5SDimitry Andric PSI = PSI_;
102890b57cec5SDimitry Andric
102900b57cec5SDimitry Andric // Don't attempt if
102910b57cec5SDimitry Andric // 1. the target claims to have no vector registers, and
102920b57cec5SDimitry Andric // 2. interleaving won't help ILP.
102930b57cec5SDimitry Andric //
102940b57cec5SDimitry Andric // The second condition is necessary because, even if the target has no
102950b57cec5SDimitry Andric // vector registers, loop vectorization may still enable scalar
102960b57cec5SDimitry Andric // interleaving.
102978bcb0991SDimitry Andric if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10298fe013be4SDimitry Andric TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
102995ffd83dbSDimitry Andric return LoopVectorizeResult(false, false);
103000b57cec5SDimitry Andric
103015ffd83dbSDimitry Andric bool Changed = false, CFGChanged = false;
103020b57cec5SDimitry Andric
103030b57cec5SDimitry Andric // The vectorizer requires loops to be in simplified form.
103040b57cec5SDimitry Andric // Since simplification may add new inner loops, it has to run before the
103050b57cec5SDimitry Andric // legality and profitability checks. This means running the loop vectorizer
103060b57cec5SDimitry Andric // will simplify all loops, regardless of whether anything ends up being
103070b57cec5SDimitry Andric // vectorized.
10308bdd1243dSDimitry Andric for (const auto &L : *LI)
103095ffd83dbSDimitry Andric Changed |= CFGChanged |=
103100b57cec5SDimitry Andric simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
103110b57cec5SDimitry Andric
103120b57cec5SDimitry Andric // Build up a worklist of inner-loops to vectorize. This is necessary as
103130b57cec5SDimitry Andric // the act of vectorizing or partially unrolling a loop creates new loops
103140b57cec5SDimitry Andric // and can invalidate iterators across the loops.
103150b57cec5SDimitry Andric SmallVector<Loop *, 8> Worklist;
103160b57cec5SDimitry Andric
103170b57cec5SDimitry Andric for (Loop *L : *LI)
103180b57cec5SDimitry Andric collectSupportedLoops(*L, LI, ORE, Worklist);
103190b57cec5SDimitry Andric
103200b57cec5SDimitry Andric LoopsAnalyzed += Worklist.size();
103210b57cec5SDimitry Andric
103220b57cec5SDimitry Andric // Now walk the identified inner loops.
103230b57cec5SDimitry Andric while (!Worklist.empty()) {
103240b57cec5SDimitry Andric Loop *L = Worklist.pop_back_val();
103250b57cec5SDimitry Andric
103260b57cec5SDimitry Andric // For the inner loops we actually process, form LCSSA to simplify the
103270b57cec5SDimitry Andric // transform.
103280b57cec5SDimitry Andric Changed |= formLCSSARecursively(*L, *DT, LI, SE);
103290b57cec5SDimitry Andric
103305ffd83dbSDimitry Andric Changed |= CFGChanged |= processLoop(L);
10331bdd1243dSDimitry Andric
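// The transform invalidates cached loop-access info, so drop it;
// asserts builds can additionally re-verify SCEV after each changed
// loop.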
10332c9157d92SDimitry Andric if (Changed) {
10333bdd1243dSDimitry Andric LAIs->clear();
10334c9157d92SDimitry Andric
10335c9157d92SDimitry Andric #ifndef NDEBUG
10336c9157d92SDimitry Andric if (VerifySCEV)
10337c9157d92SDimitry Andric SE->verify();
10338c9157d92SDimitry Andric #endif
10339c9157d92SDimitry Andric }
103400b57cec5SDimitry Andric }
103410b57cec5SDimitry Andric
103420b57cec5SDimitry Andric // Process each loop nest in the function.
103435ffd83dbSDimitry Andric return LoopVectorizeResult(Changed, CFGChanged);
103440b57cec5SDimitry Andric }
103450b57cec5SDimitry Andric
103460b57cec5SDimitry Andric PreservedAnalyses LoopVectorizePass::run(Function &F,
103470b57cec5SDimitry Andric FunctionAnalysisManager &AM) {
103480b57cec5SDimitry Andric auto &LI = AM.getResult<LoopAnalysis>(F);
1034981ad6265SDimitry Andric // There are no loops in the function. Return before computing other expensive
1035081ad6265SDimitry Andric // analyses.
1035181ad6265SDimitry Andric if (LI.empty())
1035281ad6265SDimitry Andric return PreservedAnalyses::all();
1035381ad6265SDimitry Andric auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
103540b57cec5SDimitry Andric auto &TTI = AM.getResult<TargetIRAnalysis>(F);
103550b57cec5SDimitry Andric auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
103560b57cec5SDimitry Andric auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
103570b57cec5SDimitry Andric auto &AC = AM.getResult<AssumptionAnalysis>(F);
103580b57cec5SDimitry Andric auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
103590b57cec5SDimitry Andric auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
103600b57cec5SDimitry Andric
10361bdd1243dSDimitry Andric LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
103625ffd83dbSDimitry Andric auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
103630b57cec5SDimitry Andric ProfileSummaryInfo *PSI =
103645ffd83dbSDimitry Andric MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
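// BFI is only requested when a profile summary is available, since it
// is only used together with profile data.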
10365fe013be4SDimitry Andric BlockFrequencyInfo *BFI = nullptr;
10366fe013be4SDimitry Andric if (PSI && PSI->hasProfileSummary())
10367fe013be4SDimitry Andric BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
103685ffd83dbSDimitry Andric LoopVectorizeResult Result =
10369bdd1243dSDimitry Andric runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
103705ffd83dbSDimitry Andric if (!Result.MadeAnyChange)
103710b57cec5SDimitry Andric return PreservedAnalyses::all();
103720b57cec5SDimitry Andric PreservedAnalyses PA;
103730b57cec5SDimitry Andric
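// With assignment tracking enabled, vectorization can leave behind
// redundant debug-info instructions; clean them up in each block.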
10374fe013be4SDimitry Andric if (isAssignmentTrackingEnabled(*F.getParent())) {
10375fe013be4SDimitry Andric for (auto &BB : F)
10376fe013be4SDimitry Andric RemoveRedundantDbgInstrs(&BB);
10377fe013be4SDimitry Andric }
10378fe013be4SDimitry Andric
103790b57cec5SDimitry Andric // We currently do not preserve loopinfo/dominator analyses with outer loop
103800b57cec5SDimitry Andric // vectorization. Until this is addressed, mark these analyses as preserved
103810b57cec5SDimitry Andric // only for non-VPlan-native path.
103820b57cec5SDimitry Andric // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
103830b57cec5SDimitry Andric if (!EnableVPlanNativePath) {
103840b57cec5SDimitry Andric PA.preserve<LoopAnalysis>();
103850b57cec5SDimitry Andric PA.preserve<DominatorTreeAnalysis>();
10386fe013be4SDimitry Andric PA.preserve<ScalarEvolutionAnalysis>();
103870b57cec5SDimitry Andric }
103880eae32dcSDimitry Andric
103890eae32dcSDimitry Andric if (Result.MadeCFGChange) {
103900eae32dcSDimitry Andric // Making CFG changes likely means a loop got vectorized. Indicate that
103910eae32dcSDimitry Andric // extra simplification passes should be run.
103920eae32dcSDimitry Andric // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
103930eae32dcSDimitry Andric // be run if runtime checks have been added.
103940eae32dcSDimitry Andric AM.getResult<ShouldRunExtraVectorPasses>(F);
103950eae32dcSDimitry Andric PA.preserve<ShouldRunExtraVectorPasses>();
103960eae32dcSDimitry Andric } else {
103975ffd83dbSDimitry Andric PA.preserveSet<CFGAnalyses>();
103980eae32dcSDimitry Andric }
103990b57cec5SDimitry Andric return PA;
104000b57cec5SDimitry Andric }
10401349cc55cSDimitry Andric
10402349cc55cSDimitry Andric void LoopVectorizePass::printPipeline(
10403349cc55cSDimitry Andric raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10404349cc55cSDimitry Andric static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10405349cc55cSDimitry Andric OS, MapClassName2PassName);
10406349cc55cSDimitry Andric
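// The printed form is e.g.
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// with the pass name coming from the PassInfoMixin printing above.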
10407fe013be4SDimitry Andric OS << '<';
10408349cc55cSDimitry Andric OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10409349cc55cSDimitry Andric OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10410fe013be4SDimitry Andric OS << '>';
10411349cc55cSDimitry Andric }