//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
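//
// Illustrative sketch (not from the original header): with a vectorization
// factor (VF) of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that each 'wide' iteration processes four
// elements with a single SIMD add, followed by a scalar remainder loop for
// the iterations that do not fill a whole vector:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i ... i+3] = b[i ... i+3] + 1;  // one wide iteration
//   for (; i < n; ++i)                  // scalar epilogue
//     a[i] = b[i] + 1;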
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(
            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
            "data-and-control-without-rt-check",
            "Similar to data-and-control, but remove the runtime check")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
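///
/// Illustrative example (not from the original source): a group that accesses
/// only A[3*i] and A[3*i+1] has a gap at A[3*i+2]; loading or storing the
/// group as a whole requires masking off the lanes that belong to the gap.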
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
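///
/// Illustrative example (not from the original source): in
///   if (Cond[i]) A[i] = X;
/// the store must be masked (predicated) once the branch is flattened during
/// vectorization, so it counts against this limit.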
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
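  // Illustrative example (not from the original source): on x86, x86_fp80 has
  // a type size of 80 bits but an alloc size of 96 or 128 bits depending on
  // the ABI, so it is irregular; i32 (32-bit size, 32-bit alloc size) is not.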
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
        /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops. \p ExpandedSCEVs is used to
  /// look up SCEV expansions for expressions needed during skeleton creation.
  virtual std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask, bool NeedsMaskForGaps);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. \p Step is the SCEV-expanded induction step to use. In cases
  /// where the loop skeleton is more complicated (i.e., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
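  ///
  /// Illustrative note (not from the original source): for an induction with
  /// start value Start and step Step, users of the IV outside the loop see a
  /// value corresponding to Start + TripCount * Step, i.e. the value the
  /// scalar IV would have had after its final iteration.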
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan, VPTransformState &State);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
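  // (Illustrative note, not from the original source: the three loops are the
  // main vector loop, the vector epilogue loop, and the scalar remainder
  // loop.)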
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
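///
/// Illustrative note (not from the original source): for a fixed VF such as 4
/// this folds to the integer constant 4, while for a scalable VF such as
/// "vscale x 4" it emits a computation based on vscale (vscale * 4).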
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *OrigLoop) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();
  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        Instruction *Instr = CurRec->getUnderlyingInstr();
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
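  ///
  /// Illustrative note (not from the original source): if MaxLocalUsers for a
  /// register class exceeds the number of registers the target provides for
  /// that class at some VF, that VF is expected to cause spilling, which the
  /// cost model and interleave-count selection take into account.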
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
1331 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1332 if (VF.isScalar()) 1333 return true; 1334 1335 // Cost model is not run in the VPlan-native path - return conservative 1336 // result until this changes. 1337 if (EnableVPlanNativePath) 1338 return false; 1339 1340 auto ScalarsPerVF = Scalars.find(VF); 1341 assert(ScalarsPerVF != Scalars.end() && 1342 "Scalar values are not calculated for VF"); 1343 return ScalarsPerVF->second.count(I); 1344 } 1345 1346 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1347 /// for vectorization factor \p VF. 1348 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1349 return VF.isVector() && MinBWs.contains(I) && 1350 !isProfitableToScalarize(I, VF) && 1351 !isScalarAfterVectorization(I, VF); 1352 } 1353 1354 /// Decision that was taken during cost calculation for memory instruction. 1355 enum InstWidening { 1356 CM_Unknown, 1357 CM_Widen, // For consecutive accesses with stride +1. 1358 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1359 CM_Interleave, 1360 CM_GatherScatter, 1361 CM_Scalarize 1362 }; 1363 1364 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1365 /// instruction \p I and vector width \p VF. 1366 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1367 InstructionCost Cost) { 1368 assert(VF.isVector() && "Expected VF >=2"); 1369 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1370 } 1371 1372 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1373 /// interleaving group \p Grp and vector width \p VF. 1374 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1375 ElementCount VF, InstWidening W, 1376 InstructionCost Cost) { 1377 assert(VF.isVector() && "Expected VF >=2"); 1378 // Broadcast this decision to all instructions inside the group. 1379 // But the cost will be assigned to one instruction only. 1380 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1381 if (auto *I = Grp->getMember(i)) { 1382 if (Grp->getInsertPos() == I) 1383 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1384 else 1385 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1386 } 1387 } 1388 } 1389 1390 /// Return the cost model decision for the given instruction \p I and vector 1391 /// width \p VF. Return CM_Unknown if this instruction did not pass 1392 /// through the cost modeling. 1393 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1394 assert(VF.isVector() && "Expected VF to be a vector VF"); 1395 // Cost model is not run in the VPlan-native path - return conservative 1396 // result until this changes. 1397 if (EnableVPlanNativePath) 1398 return CM_GatherScatter; 1399 1400 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1401 auto Itr = WideningDecisions.find(InstOnVF); 1402 if (Itr == WideningDecisions.end()) 1403 return CM_Unknown; 1404 return Itr->second.first; 1405 } 1406 1407 /// Return the vectorization cost for the given instruction \p I and vector 1408 /// width \p VF.
1409 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1410 assert(VF.isVector() && "Expected VF >=2"); 1411 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1412 assert(WideningDecisions.contains(InstOnVF) && 1413 "The cost is not calculated"); 1414 return WideningDecisions[InstOnVF].second; 1415 } 1416 1417 /// Return True if instruction \p I is an optimizable truncate whose operand 1418 /// is an induction variable. Such a truncate will be removed by adding a new 1419 /// induction variable with the destination type. 1420 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1421 // If the instruction is not a truncate, return false. 1422 auto *Trunc = dyn_cast<TruncInst>(I); 1423 if (!Trunc) 1424 return false; 1425 1426 // Get the source and destination types of the truncate. 1427 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1428 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1429 1430 // If the truncate is free for the given types, return false. Replacing a 1431 // free truncate with an induction variable would add an induction variable 1432 // update instruction to each iteration of the loop. We exclude from this 1433 // check the primary induction variable since it will need an update 1434 // instruction regardless. 1435 Value *Op = Trunc->getOperand(0); 1436 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1437 return false; 1438 1439 // If the truncated value is not an induction variable, return false. 1440 return Legal->isInductionPhi(Op); 1441 } 1442 1443 /// Collects the instructions to scalarize for each predicated instruction in 1444 /// the loop. 1445 void collectInstsToScalarize(ElementCount VF); 1446 1447 /// Collect Uniform and Scalar values for the given \p VF. 1448 /// The sets depend on CM decision for Load/Store instructions 1449 /// that may be vectorized as interleave, gather-scatter or scalarized. 1450 void collectUniformsAndScalars(ElementCount VF) { 1451 // Do the analysis once. 1452 if (VF.isScalar() || Uniforms.contains(VF)) 1453 return; 1454 setCostBasedWideningDecision(VF); 1455 collectLoopUniforms(VF); 1456 collectLoopScalars(VF); 1457 } 1458 1459 /// Returns true if the target machine supports masked store operation 1460 /// for the given \p DataType and kind of access to \p Ptr. 1461 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1462 return Legal->isConsecutivePtr(DataType, Ptr) && 1463 TTI.isLegalMaskedStore(DataType, Alignment); 1464 } 1465 1466 /// Returns true if the target machine supports masked load operation 1467 /// for the given \p DataType and kind of access to \p Ptr. 1468 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1469 return Legal->isConsecutivePtr(DataType, Ptr) && 1470 TTI.isLegalMaskedLoad(DataType, Alignment); 1471 } 1472 1473 /// Returns true if the target machine can represent \p V as a masked gather 1474 /// or scatter operation. 
1475 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1476 bool LI = isa<LoadInst>(V); 1477 bool SI = isa<StoreInst>(V); 1478 if (!LI && !SI) 1479 return false; 1480 auto *Ty = getLoadStoreType(V); 1481 Align Align = getLoadStoreAlignment(V); 1482 if (VF.isVector()) 1483 Ty = VectorType::get(Ty, VF); 1484 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1485 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1486 } 1487 1488 /// Returns true if the target machine supports all of the reduction 1489 /// variables found for the given VF. 1490 bool canVectorizeReductions(ElementCount VF) const { 1491 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1492 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1493 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1494 })); 1495 } 1496 1497 /// Given costs for both strategies, return true if the scalar predication 1498 /// lowering should be used for div/rem. This incorporates an override 1499 /// option so it is not simply a cost comparison. 1500 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1501 InstructionCost SafeDivisorCost) const { 1502 switch (ForceSafeDivisor) { 1503 case cl::BOU_UNSET: 1504 return ScalarCost < SafeDivisorCost; 1505 case cl::BOU_TRUE: 1506 return false; 1507 case cl::BOU_FALSE: 1508 return true; 1509 }; 1510 llvm_unreachable("impossible case value"); 1511 } 1512 1513 /// Returns true if \p I is an instruction which requires predication and 1514 /// for which our chosen predication strategy is scalarization (i.e. we 1515 /// don't have an alternate strategy such as masking available). 1516 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1517 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1518 1519 /// Returns true if \p I is an instruction that needs to be predicated 1520 /// at runtime. The result is independent of the predication mechanism. 1521 /// Superset of instructions that return true for isScalarWithPredication. 1522 bool isPredicatedInst(Instruction *I) const; 1523 1524 /// Return the costs for our two available strategies for lowering a 1525 /// div/rem operation which requires speculating at least one lane. 1526 /// First result is for scalarization (will be invalid for scalable 1527 /// vectors); second is for the safe-divisor strategy. 1528 std::pair<InstructionCost, InstructionCost> 1529 getDivRemSpeculationCost(Instruction *I, 1530 ElementCount VF) const; 1531 1532 /// Returns true if \p I is a memory instruction with consecutive memory 1533 /// access that can be widened. 1534 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1535 1536 /// Returns true if \p I is a memory instruction in an interleaved-group 1537 /// of memory accesses that can be vectorized with wide vector loads/stores 1538 /// and shuffles. 1539 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); 1540 1541 /// Check if \p Instr belongs to any interleaved access group. 1542 bool isAccessInterleaved(Instruction *Instr) { 1543 return InterleaveInfo.isInterleaved(Instr); 1544 } 1545 1546 /// Get the interleaved access group that \p Instr belongs to. 1547 const InterleaveGroup<Instruction> * 1548 getInterleavedAccessGroup(Instruction *Instr) { 1549 return InterleaveInfo.getInterleaveGroup(Instr); 1550 } 1551 1552 /// Returns true if we're required to use a scalar epilogue for at least 1553 /// the final iteration of the original loop. 
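/// This is the case, for example, when the loop may exit from a block other
/// than the latch, or when an interleave group with gaps requires the last
/// iteration to be peeled (see InterleaveInfo.requiresScalarEpilogue()).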
1554 bool requiresScalarEpilogue(bool IsVectorizing) const { 1555 if (!isScalarEpilogueAllowed()) 1556 return false; 1557 // If we might exit from anywhere but the latch, must run the exiting 1558 // iteration in scalar form. 1559 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1560 return true; 1561 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); 1562 } 1563 1564 /// Returns true if we're required to use a scalar epilogue for at least 1565 /// the final iteration of the original loop for all VFs in \p Range. 1566 /// A scalar epilogue must either be required for all VFs in \p Range or for 1567 /// none. 1568 bool requiresScalarEpilogue(VFRange Range) const { 1569 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1570 return requiresScalarEpilogue(VF.isVector()); 1571 }; 1572 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1573 assert( 1574 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1575 "all VFs in range must agree on whether a scalar epilogue is required"); 1576 return IsRequired; 1577 } 1578 1579 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1580 /// loop hint annotation. 1581 bool isScalarEpilogueAllowed() const { 1582 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1583 } 1584 1585 /// Returns the TailFoldingStyle that is best for the current loop. 1586 TailFoldingStyle 1587 getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1588 if (!CanFoldTailByMasking) 1589 return TailFoldingStyle::None; 1590 1591 if (ForceTailFoldingStyle.getNumOccurrences()) 1592 return ForceTailFoldingStyle; 1593 1594 return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); 1595 } 1596 1597 /// Returns true if all loop blocks should be masked in order to fold the 1598 /// tail of the loop. 1598 bool foldTailByMasking() const { 1599 return getTailFoldingStyle() != TailFoldingStyle::None; 1600 } 1601 1602 /// Returns true if the instructions in block \p BB require predication 1603 /// for any reason, e.g. because tail folding now requires a predicate 1604 /// or because the block in the original loop was predicated. 1605 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1606 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1607 } 1608 1609 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1610 /// nodes to the chain of instructions representing the reductions. Uses a 1611 /// MapVector to ensure deterministic iteration order. 1612 using ReductionChainMap = 1613 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1614 1615 /// Return the chain of instructions representing an inloop reduction. 1616 const ReductionChainMap &getInLoopReductionChains() const { 1617 return InLoopReductionChains; 1618 } 1619 1620 /// Returns true if the Phi is part of an inloop reduction. 1621 bool isInLoopReduction(PHINode *Phi) const { 1622 return InLoopReductionChains.count(Phi); 1623 } 1624 1625 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1626 /// with factor VF. Return the cost of the instruction, including 1627 /// scalarization overhead if it's needed. 1628 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1629 1630 /// Estimate cost of a call instruction CI if it were vectorized with factor 1631 /// VF. Return the cost of the instruction, including scalarization overhead 1632 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1633 /// scalarized - 1634 /// i.e.
either vector version isn't available, or is too expensive. 1635 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1636 Function **Variant, 1637 bool *NeedsMask = nullptr) const; 1638 1639 /// Invalidates decisions already taken by the cost model. 1640 void invalidateCostModelingDecisions() { 1641 WideningDecisions.clear(); 1642 Uniforms.clear(); 1643 Scalars.clear(); 1644 } 1645 1646 /// The vectorization cost is a combination of the cost itself and a boolean 1647 /// indicating whether any of the contributing operations will actually 1648 /// operate on vector values after type legalization in the backend. If this 1649 /// latter value is false, then all operations will be scalarized (i.e. no 1650 /// vectorization has actually taken place). 1651 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1652 1653 /// Returns the expected execution cost. The unit of the cost does 1654 /// not matter because we use the 'cost' units to compare different 1655 /// vector widths. The cost that is returned is *not* normalized by 1656 /// the factor width. If \p Invalid is not nullptr, this function 1657 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1658 /// each instruction that has an Invalid cost for the given VF. 1659 VectorizationCostTy 1660 expectedCost(ElementCount VF, 1661 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1662 1663 bool hasPredStores() const { return NumPredStores > 0; } 1664 1665 /// Returns true if epilogue vectorization is considered profitable, and 1666 /// false otherwise. 1667 /// \p VF is the vectorization factor chosen for the original loop. 1668 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1669 1670 private: 1671 unsigned NumPredStores = 0; 1672 1673 /// \return An upper bound for the vectorization factors for both 1674 /// fixed and scalable vectorization, where the minimum-known number of 1675 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1676 /// disabled or unsupported, then the scalable part will be equal to 1677 /// ElementCount::getScalable(0). 1678 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1679 ElementCount UserVF, 1680 bool FoldTailByMasking); 1681 1682 /// \return the maximized element count based on the targets vector 1683 /// registers and the loop trip-count, but limited to a maximum safe VF. 1684 /// This is a helper function of computeFeasibleMaxVF. 1685 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1686 unsigned SmallestType, 1687 unsigned WidestType, 1688 ElementCount MaxSafeVF, 1689 bool FoldTailByMasking); 1690 1691 /// \return the maximum legal scalable VF, based on the safe max number 1692 /// of elements. 1693 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1694 1695 /// Returns the execution time cost of an instruction for a given vector 1696 /// width. Vector width of one means scalar. 1697 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1698 1699 /// The cost-computation logic from getInstructionCost which provides 1700 /// the vector type as an output parameter. 1701 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1702 Type *&VectorTy); 1703 1704 /// Return the cost of instructions in an inloop reduction pattern, if I is 1705 /// part of that pattern. 
1706 std::optional<InstructionCost> 1707 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1708 TTI::TargetCostKind CostKind); 1709 1710 /// Calculate vectorization cost of memory instruction \p I. 1711 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1712 1713 /// The cost computation for a scalarized memory instruction. 1714 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1715 1716 /// The cost computation for an interleaving group of memory instructions. 1717 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1718 1719 /// The cost computation for a Gather/Scatter instruction. 1720 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1721 1722 /// The cost computation for widening instruction \p I with consecutive 1723 /// memory access. 1724 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1725 1726 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1727 /// Load: scalar load + broadcast. 1728 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1729 /// element) 1730 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1731 1732 /// Estimate the overhead of scalarizing an instruction. This is a 1733 /// convenience wrapper for the type-based getScalarizationOverhead API. 1734 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, 1735 TTI::TargetCostKind CostKind) const; 1736 1737 /// Returns true if an artificially high cost for emulated masked memrefs 1738 /// should be used. 1739 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1740 1741 /// Map of scalar integer values to the smallest bitwidth they can be legally 1742 /// represented as. The vector equivalents of these values should be truncated 1743 /// to this type. 1744 MapVector<Instruction *, uint64_t> MinBWs; 1745 1746 /// A type representing the costs for instructions if they were to be 1747 /// scalarized rather than vectorized. The entries are Instruction-Cost 1748 /// pairs. 1749 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1750 1751 /// A set containing all BasicBlocks that are known to be present after 1752 /// vectorization as predicated blocks. The data is collected per VF. 1753 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1754 PredicatedBBsAfterVectorization; 1755 1756 /// Records whether it is allowed to have the original scalar loop execute at 1757 /// least once. This may be needed as a fallback loop in case runtime 1758 /// aliasing/dependence checks fail, or to handle the tail/remainder 1759 /// iterations when the trip count is unknown or doesn't divide by the VF, 1760 /// or as a peel-loop to handle gaps in interleave-groups. 1761 /// Under optsize and when the trip count is very small we don't allow any 1762 /// iterations to execute in the scalar loop. 1763 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1764 1765 /// True if all blocks of the loop are to be masked in order to fold the tail 1766 /// of the scalar iterations. 1766 bool CanFoldTailByMasking = false; 1767 1768 /// A map holding scalar costs for different vectorization factors. The 1769 /// presence of a cost for an instruction in the mapping indicates that the 1770 /// instruction will be scalarized when vectorizing with the associated 1771 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1772 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1773 1774 /// Holds the instructions known to be uniform after vectorization. 1775 /// The data is collected per VF. 1776 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1777 1778 /// Holds the instructions known to be scalar after vectorization. 1779 /// The data is collected per VF. 1780 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1781 1782 /// Holds the instructions (address computations) that are forced to be 1783 /// scalarized. 1784 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1785 1786 /// PHINodes of the reductions that should be expanded in-loop along with 1787 /// their associated chains of reduction operations, in program order from top 1788 /// (PHI) to bottom 1789 ReductionChainMap InLoopReductionChains; 1790 1791 /// A Map of inloop reduction operations and their immediate chain operand. 1792 /// FIXME: This can be removed once reductions can be costed correctly in 1793 /// vplan. This was added to allow quick lookup to the inloop operations, 1794 /// without having to loop through InLoopReductionChains. 1795 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1796 1797 /// Returns the expected difference in cost from scalarizing the expression 1798 /// feeding a predicated instruction \p PredInst. The instructions to 1799 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1800 /// non-negative return value implies the expression will be scalarized. 1801 /// Currently, only single-use chains are considered for scalarization. 1802 InstructionCost computePredInstDiscount(Instruction *PredInst, 1803 ScalarCostsTy &ScalarCosts, 1804 ElementCount VF); 1805 1806 /// Collect the instructions that are uniform after vectorization. An 1807 /// instruction is uniform if we represent it with a single scalar value in 1808 /// the vectorized loop corresponding to each vector iteration. Examples of 1809 /// uniform instructions include pointer operands of consecutive or 1810 /// interleaved memory accesses. Note that although uniformity implies an 1811 /// instruction will be scalar, the reverse is not true. In general, a 1812 /// scalarized instruction will be represented by VF scalar values in the 1813 /// vectorized loop, each corresponding to an iteration of the original 1814 /// scalar loop. 1815 void collectLoopUniforms(ElementCount VF); 1816 1817 /// Collect the instructions that are scalar after vectorization. An 1818 /// instruction is scalar if it is known to be uniform or will be scalarized 1819 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1820 /// to the list if they are used by a load/store instruction that is marked as 1821 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1822 /// VF values in the vectorized loop, each corresponding to an iteration of 1823 /// the original scalar loop. 1824 void collectLoopScalars(ElementCount VF); 1825 1826 /// Keeps cost model vectorization decision and cost for instructions. 1827 /// Right now it is used for memory instructions only. 1828 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1829 std::pair<InstWidening, InstructionCost>>; 1830 1831 DecisionList WideningDecisions; 1832 1833 /// Returns true if \p V is expected to be vectorized and it needs to be 1834 /// extracted. 
1835 bool needsExtract(Value *V, ElementCount VF) const { 1836 Instruction *I = dyn_cast<Instruction>(V); 1837 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1838 TheLoop->isLoopInvariant(I)) 1839 return false; 1840 1841 // Assume we can vectorize V (and hence we need extraction) if the 1842 // scalars are not computed yet. This can happen, because it is called 1843 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1844 // the scalars are collected. That should be a safe assumption in most 1845 // cases, because we check if the operands have vectorizable types 1846 // beforehand in LoopVectorizationLegality. 1847 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1848 }; 1849 1850 /// Returns a range containing only operands needing to be extracted. 1851 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1852 ElementCount VF) const { 1853 return SmallVector<Value *, 4>(make_filter_range( 1854 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1855 } 1856 1857 public: 1858 /// The loop that we evaluate. 1859 Loop *TheLoop; 1860 1861 /// Predicated scalar evolution analysis. 1862 PredicatedScalarEvolution &PSE; 1863 1864 /// Loop Info analysis. 1865 LoopInfo *LI; 1866 1867 /// Vectorization legality. 1868 LoopVectorizationLegality *Legal; 1869 1870 /// Vector target information. 1871 const TargetTransformInfo &TTI; 1872 1873 /// Target Library Info. 1874 const TargetLibraryInfo *TLI; 1875 1876 /// Demanded bits analysis. 1877 DemandedBits *DB; 1878 1879 /// Assumption cache. 1880 AssumptionCache *AC; 1881 1882 /// Interface to emit optimization remarks. 1883 OptimizationRemarkEmitter *ORE; 1884 1885 const Function *TheFunction; 1886 1887 /// Loop Vectorize Hint. 1888 const LoopVectorizeHints *Hints; 1889 1890 /// The interleave access information contains groups of interleaved accesses 1891 /// with the same stride and close to each other. 1892 InterleavedAccessInfo &InterleaveInfo; 1893 1894 /// Values to ignore in the cost model. 1895 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1896 1897 /// Values to ignore in the cost model when VF > 1. 1898 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1899 1900 /// All element types found in the loop. 1901 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1902 }; 1903 } // end namespace llvm 1904 1905 namespace { 1906 /// Helper struct to manage generating runtime checks for vectorization. 1907 /// 1908 /// The runtime checks are created up-front in temporary blocks to allow better 1909 /// estimating the cost and un-linked from the existing IR. After deciding to 1910 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1911 /// temporary blocks are completely removed. 1912 class GeneratedRTChecks { 1913 /// Basic block which contains the generated SCEV checks, if any. 1914 BasicBlock *SCEVCheckBlock = nullptr; 1915 1916 /// The value representing the result of the generated SCEV checks. If it is 1917 /// nullptr, either no SCEV checks have been generated or they have been used. 1918 Value *SCEVCheckCond = nullptr; 1919 1920 /// Basic block which contains the generated memory runtime checks, if any. 1921 BasicBlock *MemCheckBlock = nullptr; 1922 1923 /// The value representing the result of the generated memory runtime checks. 1924 /// If it is nullptr, either no memory runtime checks have been generated or 1925 /// they have been used. 
1926 Value *MemRuntimeCheckCond = nullptr; 1927 1928 DominatorTree *DT; 1929 LoopInfo *LI; 1930 TargetTransformInfo *TTI; 1931 1932 SCEVExpander SCEVExp; 1933 SCEVExpander MemCheckExp; 1934 1935 bool CostTooHigh = false; 1936 1937 public: 1938 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1939 TargetTransformInfo *TTI, const DataLayout &DL) 1940 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1941 MemCheckExp(SE, DL, "scev.check") {} 1942 1943 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1944 /// accurately estimate the cost of the runtime checks. The blocks are 1945 /// un-linked from the IR and are added back during vector code generation. If 1946 /// there is no vector code generation, the check blocks are removed 1947 /// completely. 1948 void Create(Loop *L, const LoopAccessInfo &LAI, 1949 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1950 1951 // Hard cutoff to limit compile-time increase in case a very large number of 1952 // runtime checks needs to be generated. 1953 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1954 // profile info. 1955 CostTooHigh = 1956 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1957 if (CostTooHigh) 1958 return; 1959 1960 BasicBlock *LoopHeader = L->getHeader(); 1961 BasicBlock *Preheader = L->getLoopPreheader(); 1962 1963 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1964 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1965 // may be used by SCEVExpander. The blocks will be un-linked from their 1966 // predecessors and removed from LI & DT at the end of the function. 1967 if (!UnionPred.isAlwaysTrue()) { 1968 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1969 nullptr, "vector.scevcheck"); 1970 1971 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1972 &UnionPred, SCEVCheckBlock->getTerminator()); 1973 } 1974 1975 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1976 if (RtPtrChecking.Need) { 1977 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1978 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1979 "vector.memcheck"); 1980 1981 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1982 if (DiffChecks) { 1983 Value *RuntimeVF = nullptr; 1984 MemRuntimeCheckCond = addDiffRuntimeChecks( 1985 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1986 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1987 if (!RuntimeVF) 1988 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1989 return RuntimeVF; 1990 }, 1991 IC); 1992 } else { 1993 MemRuntimeCheckCond = 1994 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1995 RtPtrChecking.getChecks(), MemCheckExp); 1996 } 1997 assert(MemRuntimeCheckCond && 1998 "no RT checks generated although RtPtrChecking " 1999 "claimed checks are required"); 2000 } 2001 2002 if (!MemCheckBlock && !SCEVCheckBlock) 2003 return; 2004 2005 // Unhook the temporary blocks with the checks, update various places 2006 // accordingly.
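// Concretely: redirect all uses of the check blocks back to the preheader,
// give each check block a temporary unreachable terminator, make the
// preheader the immediate dominator of the loop header again, and drop the
// check blocks from the DominatorTree and LoopInfo until the checks are
// actually emitted.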
2007 if (SCEVCheckBlock) 2008 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2009 if (MemCheckBlock) 2010 MemCheckBlock->replaceAllUsesWith(Preheader); 2011 2012 if (SCEVCheckBlock) { 2013 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2014 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2015 Preheader->getTerminator()->eraseFromParent(); 2016 } 2017 if (MemCheckBlock) { 2018 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2019 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2020 Preheader->getTerminator()->eraseFromParent(); 2021 } 2022 2023 DT->changeImmediateDominator(LoopHeader, Preheader); 2024 if (MemCheckBlock) { 2025 DT->eraseNode(MemCheckBlock); 2026 LI->removeBlock(MemCheckBlock); 2027 } 2028 if (SCEVCheckBlock) { 2029 DT->eraseNode(SCEVCheckBlock); 2030 LI->removeBlock(SCEVCheckBlock); 2031 } 2032 } 2033 2034 InstructionCost getCost() { 2035 if (SCEVCheckBlock || MemCheckBlock) 2036 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2037 2038 if (CostTooHigh) { 2039 InstructionCost Cost; 2040 Cost.setInvalid(); 2041 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2042 return Cost; 2043 } 2044 2045 InstructionCost RTCheckCost = 0; 2046 if (SCEVCheckBlock) 2047 for (Instruction &I : *SCEVCheckBlock) { 2048 if (SCEVCheckBlock->getTerminator() == &I) 2049 continue; 2050 InstructionCost C = 2051 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2052 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2053 RTCheckCost += C; 2054 } 2055 if (MemCheckBlock) 2056 for (Instruction &I : *MemCheckBlock) { 2057 if (MemCheckBlock->getTerminator() == &I) 2058 continue; 2059 InstructionCost C = 2060 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2061 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2062 RTCheckCost += C; 2063 } 2064 2065 if (SCEVCheckBlock || MemCheckBlock) 2066 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2067 << "\n"); 2068 2069 return RTCheckCost; 2070 } 2071 2072 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2073 /// unused. 2074 ~GeneratedRTChecks() { 2075 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2076 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2077 if (!SCEVCheckCond) 2078 SCEVCleaner.markResultUsed(); 2079 2080 if (!MemRuntimeCheckCond) 2081 MemCheckCleaner.markResultUsed(); 2082 2083 if (MemRuntimeCheckCond) { 2084 auto &SE = *MemCheckExp.getSE(); 2085 // Memory runtime check generation creates compares that use expanded 2086 // values. Remove them before running the SCEVExpanderCleaners. 2087 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2088 if (MemCheckExp.isInsertedInstruction(&I)) 2089 continue; 2090 SE.forgetValue(&I); 2091 I.eraseFromParent(); 2092 } 2093 } 2094 MemCheckCleaner.cleanup(); 2095 SCEVCleaner.cleanup(); 2096 2097 if (SCEVCheckCond) 2098 SCEVCheckBlock->eraseFromParent(); 2099 if (MemRuntimeCheckCond) 2100 MemCheckBlock->eraseFromParent(); 2101 } 2102 2103 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2104 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2105 /// depending on the generated condition. 2106 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2107 BasicBlock *LoopVectorPreHeader, 2108 BasicBlock *LoopExitBlock) { 2109 if (!SCEVCheckCond) 2110 return nullptr; 2111 2112 Value *Cond = SCEVCheckCond; 2113 // Mark the check as used, to prevent it from being removed during cleanup. 
2114 SCEVCheckCond = nullptr; 2115 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2116 if (C->isZero()) 2117 return nullptr; 2118 2119 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2120 2121 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2122 // Create new preheader for vector loop. 2123 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2124 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2125 2126 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2127 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2128 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2129 SCEVCheckBlock); 2130 2131 DT->addNewBlock(SCEVCheckBlock, Pred); 2132 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2133 2134 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2135 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2136 return SCEVCheckBlock; 2137 } 2138 2139 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2140 /// the branches to branch to the vector preheader or \p Bypass, depending on 2141 /// the generated condition. 2142 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2143 BasicBlock *LoopVectorPreHeader) { 2144 // Check if we generated code that checks in runtime if arrays overlap. 2145 if (!MemRuntimeCheckCond) 2146 return nullptr; 2147 2148 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2149 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2150 MemCheckBlock); 2151 2152 DT->addNewBlock(MemCheckBlock, Pred); 2153 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2154 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2155 2156 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2157 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2158 2159 ReplaceInstWithInst( 2160 MemCheckBlock->getTerminator(), 2161 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2162 MemCheckBlock->getTerminator()->setDebugLoc( 2163 Pred->getTerminator()->getDebugLoc()); 2164 2165 // Mark the check as used, to prevent it from being removed during cleanup. 2166 MemRuntimeCheckCond = nullptr; 2167 return MemCheckBlock; 2168 } 2169 }; 2170 } // namespace 2171 2172 static bool useActiveLaneMask(TailFoldingStyle Style) { 2173 return Style == TailFoldingStyle::Data || 2174 Style == TailFoldingStyle::DataAndControlFlow || 2175 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2176 } 2177 2178 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2179 return Style == TailFoldingStyle::DataAndControlFlow || 2180 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2181 } 2182 2183 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2184 // vectorization. The loop needs to be annotated with #pragma omp simd 2185 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2186 // vector length information is not provided, vectorization is not considered 2187 // explicit. Interleave hints are not allowed either. These limitations will be 2188 // relaxed in the future. 2189 // Please, note that we are currently forced to abuse the pragma 'clang 2190 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2191 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2192 // provides *explicit vectorization hints* (LV can bypass legal checks and 2193 // assume that vectorization is legal). 
However, both hints are implemented 2194 // using the same metadata (llvm.loop.vectorize, processed by 2195 // LoopVectorizeHints). This will be fixed in the future when the native IR 2196 // representation for pragma 'omp simd' is introduced. 2197 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2198 OptimizationRemarkEmitter *ORE) { 2199 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2200 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2201 2202 // Only outer loops with an explicit vectorization hint are supported. 2203 // Unannotated outer loops are ignored. 2204 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2205 return false; 2206 2207 Function *Fn = OuterLp->getHeader()->getParent(); 2208 if (!Hints.allowVectorization(Fn, OuterLp, 2209 true /*VectorizeOnlyWhenForced*/)) { 2210 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2211 return false; 2212 } 2213 2214 if (Hints.getInterleave() > 1) { 2215 // TODO: Interleave support is future work. 2216 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2217 "outer loops.\n"); 2218 Hints.emitRemarkWithHints(); 2219 return false; 2220 } 2221 2222 return true; 2223 } 2224 2225 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2226 OptimizationRemarkEmitter *ORE, 2227 SmallVectorImpl<Loop *> &V) { 2228 // Collect inner loops and outer loops without irreducible control flow. For 2229 // now, only collect outer loops that have explicit vectorization hints. If we 2230 // are stress testing the VPlan H-CFG construction, we collect the outermost 2231 // loop of every loop nest. 2232 if (L.isInnermost() || VPlanBuildStressTest || 2233 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2234 LoopBlocksRPO RPOT(&L); 2235 RPOT.perform(LI); 2236 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2237 V.push_back(&L); 2238 // TODO: Collect inner loops inside marked outer loops in case 2239 // vectorization fails for the outer loop. Do not invoke 2240 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2241 // already known to be reducible. We can use an inherited attribute for 2242 // that. 2243 return; 2244 } 2245 } 2246 for (Loop *InnerL : L) 2247 collectSupportedLoops(*InnerL, LI, ORE, V); 2248 } 2249 2250 //===----------------------------------------------------------------------===// 2251 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2252 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2253 //===----------------------------------------------------------------------===// 2254 2255 /// This function adds 2256 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2257 /// to each vector element of Val. The sequence starts at StartIndex. 2258 /// \p Opcode is relevant for FP induction variable. 2259 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2260 Instruction::BinaryOps BinOp, ElementCount VF, 2261 IRBuilderBase &Builder) { 2262 assert(VF.isVector() && "only vector VFs are supported"); 2263 2264 // Create and check the types. 
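// (For reference: with an integer element type, lane i of the result is
// Val[i] + (StartIdx + i) * Step; FP inductions use the induction's
// FAdd/FSub together with FMul instead.)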
2265 auto *ValVTy = cast<VectorType>(Val->getType()); 2266 ElementCount VLen = ValVTy->getElementCount(); 2267 2268 Type *STy = Val->getType()->getScalarType(); 2269 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2270 "Induction Step must be an integer or FP"); 2271 assert(Step->getType() == STy && "Step has wrong type"); 2272 2273 SmallVector<Constant *, 8> Indices; 2274 2275 // Create a vector of consecutive numbers from zero to VF. 2276 VectorType *InitVecValVTy = ValVTy; 2277 if (STy->isFloatingPointTy()) { 2278 Type *InitVecValSTy = 2279 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2280 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2281 } 2282 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2283 2284 // Splat the StartIdx 2285 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2286 2287 if (STy->isIntegerTy()) { 2288 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2289 Step = Builder.CreateVectorSplat(VLen, Step); 2290 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2291 // FIXME: The newly created binary instructions should contain nsw/nuw 2292 // flags, which can be found from the original scalar operations. 2293 Step = Builder.CreateMul(InitVec, Step); 2294 return Builder.CreateAdd(Val, Step, "induction"); 2295 } 2296 2297 // Floating point induction. 2298 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2299 "Binary Opcode should be specified for FP induction"); 2300 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2301 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2302 2303 Step = Builder.CreateVectorSplat(VLen, Step); 2304 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2305 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2306 } 2307 2308 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2309 /// variable on which to base the steps, \p Step is the size of the step. 2310 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2311 const InductionDescriptor &ID, VPValue *Def, 2312 VPTransformState &State) { 2313 IRBuilderBase &Builder = State.Builder; 2314 2315 // Ensure step has the same type as that of scalar IV. 2316 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2317 if (ScalarIVTy != Step->getType()) { 2318 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to 2319 // avoid separate truncate here. 2320 assert(Step->getType()->isIntegerTy() && 2321 "Truncation requires an integer step"); 2322 Step = State.Builder.CreateTrunc(Step, ScalarIVTy); 2323 } 2324 2325 // We build scalar steps for both integer and floating-point induction 2326 // variables. Here, we determine the kind of arithmetic we will perform. 2327 Instruction::BinaryOps AddOp; 2328 Instruction::BinaryOps MulOp; 2329 if (ScalarIVTy->isIntegerTy()) { 2330 AddOp = Instruction::Add; 2331 MulOp = Instruction::Mul; 2332 } else { 2333 AddOp = ID.getInductionOpcode(); 2334 MulOp = Instruction::FMul; 2335 } 2336 2337 // Determine the number of scalars we need to generate for each unroll 2338 // iteration. 2339 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2340 // Compute the scalar steps and save the results in State. 
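// Each requested lane ends up holding ScalarIV + (Part * VF + Lane) * Step,
// with the corresponding FP operations used for floating-point inductions.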
2341 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2342 ScalarIVTy->getScalarSizeInBits()); 2343 Type *VecIVTy = nullptr; 2344 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2345 if (!FirstLaneOnly && State.VF.isScalable()) { 2346 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2347 UnitStepVec = 2348 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2349 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2350 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2351 } 2352 2353 unsigned StartPart = 0; 2354 unsigned EndPart = State.UF; 2355 unsigned StartLane = 0; 2356 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2357 if (State.Instance) { 2358 StartPart = State.Instance->Part; 2359 EndPart = StartPart + 1; 2360 StartLane = State.Instance->Lane.getKnownLane(); 2361 EndLane = StartLane + 1; 2362 } 2363 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 2364 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2365 2366 if (!FirstLaneOnly && State.VF.isScalable()) { 2367 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2368 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2369 if (ScalarIVTy->isFloatingPointTy()) 2370 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2371 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2372 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2373 State.set(Def, Add, Part); 2374 // It's useful to record the lane values too for the known minimum number 2375 // of elements so we do those below. This improves the code quality when 2376 // trying to extract the first element, for example. 2377 } 2378 2379 if (ScalarIVTy->isFloatingPointTy()) 2380 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2381 2382 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 2383 Value *StartIdx = Builder.CreateBinOp( 2384 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2385 // The step returned by `createStepForVF` is a runtime-evaluated value 2386 // when VF is scalable. Otherwise, it should be folded into a Constant. 2387 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2388 "Expected StartIdx to be folded to a constant when VF is not " 2389 "scalable"); 2390 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2391 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2392 State.set(Def, Add, VPIteration(Part, Lane)); 2393 } 2394 } 2395 } 2396 2397 /// Compute the transformed value of Index at offset StartValue using step 2398 /// StepValue. 2399 /// For integer induction, returns StartValue + Index * StepValue. 2400 /// For pointer induction, returns StartValue[Index * StepValue]. 2401 /// FIXME: The newly created binary instructions should contain nsw/nuw 2402 /// flags, which can be found from the original scalar operations. 2403 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2404 Value *StartValue, Value *Step, 2405 const InductionDescriptor &ID) { 2406 Type *StepTy = Step->getType(); 2407 Value *CastedIndex = StepTy->isIntegerTy() 2408 ? B.CreateSExtOrTrunc(Index, StepTy) 2409 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2410 if (CastedIndex != Index) { 2411 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2412 Index = CastedIndex; 2413 } 2414 2415 // Note: the IR at this point is broken. We cannot use SE to create any new 2416 // SCEV and then expand it, hoping that SCEV's simplification will give us 2417 // a more optimal code. 
Unfortunately, attempt of doing so on invalid IR may 2418 // lead to various SCEV crashes. So all we can do is to use builder and rely 2419 // on InstCombine for future simplifications. Here we handle some trivial 2420 // cases only. 2421 auto CreateAdd = [&B](Value *X, Value *Y) { 2422 assert(X->getType() == Y->getType() && "Types don't match!"); 2423 if (auto *CX = dyn_cast<ConstantInt>(X)) 2424 if (CX->isZero()) 2425 return Y; 2426 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2427 if (CY->isZero()) 2428 return X; 2429 return B.CreateAdd(X, Y); 2430 }; 2431 2432 // We allow X to be a vector type, in which case Y will potentially be 2433 // splatted into a vector with the same element count. 2434 auto CreateMul = [&B](Value *X, Value *Y) { 2435 assert(X->getType()->getScalarType() == Y->getType() && 2436 "Types don't match!"); 2437 if (auto *CX = dyn_cast<ConstantInt>(X)) 2438 if (CX->isOne()) 2439 return Y; 2440 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2441 if (CY->isOne()) 2442 return X; 2443 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2444 if (XVTy && !isa<VectorType>(Y->getType())) 2445 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2446 return B.CreateMul(X, Y); 2447 }; 2448 2449 switch (ID.getKind()) { 2450 case InductionDescriptor::IK_IntInduction: { 2451 assert(!isa<VectorType>(Index->getType()) && 2452 "Vector indices not supported for integer inductions yet"); 2453 assert(Index->getType() == StartValue->getType() && 2454 "Index type does not match StartValue type"); 2455 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2456 return B.CreateSub(StartValue, Index); 2457 auto *Offset = CreateMul(Index, Step); 2458 return CreateAdd(StartValue, Offset); 2459 } 2460 case InductionDescriptor::IK_PtrInduction: { 2461 return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); 2462 } 2463 case InductionDescriptor::IK_FpInduction: { 2464 assert(!isa<VectorType>(Index->getType()) && 2465 "Vector indices not supported for FP inductions yet"); 2466 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2467 auto InductionBinOp = ID.getInductionBinOp(); 2468 assert(InductionBinOp && 2469 (InductionBinOp->getOpcode() == Instruction::FAdd || 2470 InductionBinOp->getOpcode() == Instruction::FSub) && 2471 "Original bin op should be defined for FP induction"); 2472 2473 Value *MulExp = B.CreateFMul(Step, Index); 2474 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2475 "induction"); 2476 } 2477 case InductionDescriptor::IK_NoInduction: 2478 return nullptr; 2479 } 2480 llvm_unreachable("invalid enum"); 2481 } 2482 2483 std::optional<unsigned> getMaxVScale(const Function &F, 2484 const TargetTransformInfo &TTI) { 2485 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2486 return MaxVScale; 2487 2488 if (F.hasFnAttribute(Attribute::VScaleRange)) 2489 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2490 2491 return std::nullopt; 2492 } 2493 2494 /// For the given VF and UF and maximum trip count computed for the loop, return 2495 /// whether the induction variable might overflow in the vectorized loop. If not, 2496 /// then we know a runtime overflow check always evaluates to false and can be 2497 /// removed. 2498 static bool isIndvarOverflowCheckKnownFalse( 2499 const LoopVectorizationCostModel *Cost, 2500 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2501 // Always be conservative if we don't know the exact unroll factor. 2502 unsigned MaxUF = UF ? 
*UF : Cost->TTI.getMaxInterleaveFactor(VF); 2503 2504 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2505 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2506 2507 // We know the runtime overflow check is known false iff the (max) trip-count 2508 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2509 // the vector loop induction variable. 2510 if (unsigned TC = 2511 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { 2512 uint64_t MaxVF = VF.getKnownMinValue(); 2513 if (VF.isScalable()) { 2514 std::optional<unsigned> MaxVScale = 2515 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2516 if (!MaxVScale) 2517 return false; 2518 MaxVF *= *MaxVScale; 2519 } 2520 2521 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2522 } 2523 2524 return false; 2525 } 2526 2527 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2528 const VPIteration &Instance, 2529 VPTransformState &State) { 2530 Value *ScalarInst = State.get(Def, Instance); 2531 Value *VectorValue = State.get(Def, Instance.Part); 2532 VectorValue = Builder.CreateInsertElement( 2533 VectorValue, ScalarInst, 2534 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2535 State.set(Def, VectorValue, Instance.Part); 2536 } 2537 2538 // Return whether we allow using masked interleave-groups (for dealing with 2539 // strided loads/stores that reside in predicated blocks, or for dealing 2540 // with gaps). 2541 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2542 // If an override option has been passed in for interleaved accesses, use it. 2543 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2544 return EnableMaskedInterleavedMemAccesses; 2545 2546 return TTI.enableMaskedInterleavedAccessVectorization(); 2547 } 2548 2549 // Try to vectorize the interleave group that \p Instr belongs to. 2550 // 2551 // E.g. Translate following interleaved load group (factor = 3): 2552 // for (i = 0; i < N; i+=3) { 2553 // R = Pic[i]; // Member of index 0 2554 // G = Pic[i+1]; // Member of index 1 2555 // B = Pic[i+2]; // Member of index 2 2556 // ... // do something to R, G, B 2557 // } 2558 // To: 2559 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2560 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2561 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2562 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2563 // 2564 // Or translate following interleaved store group (factor = 3): 2565 // for (i = 0; i < N; i+=3) { 2566 // ... do something to R, G, B 2567 // Pic[i] = R; // Member of index 0 2568 // Pic[i+1] = G; // Member of index 1 2569 // Pic[i+2] = B; // Member of index 2 2570 // } 2571 // To: 2572 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2573 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2574 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2575 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2576 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2577 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2578 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2579 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2580 VPValue *BlockInMask, bool NeedsMaskForGaps) { 2581 Instruction *Instr = Group->getInsertPos(); 2582 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2583 2584 // Prepare for the vector type of the interleaved load/store. 
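// E.g. for the factor-3 i32 group in the example above with VF = 4, the wide
// vector type is <12 x i32>.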
2585 Type *ScalarTy = getLoadStoreType(Instr); 2586 unsigned InterleaveFactor = Group->getFactor(); 2587 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2588 2589 // Prepare for the new pointers. 2590 SmallVector<Value *, 2> AddrParts; 2591 unsigned Index = Group->getIndex(Instr); 2592 2593 // TODO: extend the masked interleaved-group support to reversed access. 2594 assert((!BlockInMask || !Group->isReverse()) && 2595 "Reversed masked interleave-group not supported."); 2596 2597 Value *Idx; 2598 // If the group is reverse, adjust the index to refer to the last vector lane 2599 // instead of the first. We adjust the index from the first vector lane, 2600 // rather than directly getting the pointer for lane VF - 1, because the 2601 // pointer operand of the interleaved access is supposed to be uniform. For 2602 // uniform instructions, we're only required to generate a value for the 2603 // first vector lane in each unroll iteration. 2604 if (Group->isReverse()) { 2605 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2606 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); 2607 Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); 2608 Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); 2609 Idx = Builder.CreateNeg(Idx); 2610 } else 2611 Idx = Builder.getInt32(-Index); 2612 2613 for (unsigned Part = 0; Part < UF; Part++) { 2614 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2615 State.setDebugLocFromInst(AddrPart); 2616 2617 // Note that the current instruction could be a member at any index. We need 2618 // to adjust the address to that of the member at index 0. 2619 // 2620 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2621 // b = A[i]; // Member of index 0 2622 // The current pointer points to A[i+1]; adjust it to A[i]. 2623 // 2624 // E.g. A[i+1] = a; // Member of index 1 2625 // A[i] = b; // Member of index 0 2626 // A[i+2] = c; // Member of index 2 (Current instruction) 2627 // The current pointer points to A[i+2]; adjust it to A[i]. 2628 2629 bool InBounds = false; 2630 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2631 InBounds = gep->isInBounds(); 2632 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2633 2634 // Cast to the vector pointer type.
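// I.e. form a pointer to <VF * Factor x ScalarTy> in AddrPart's address
// space and bitcast the member-0 address to it.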
2635 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2636 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2637 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2638 } 2639 2640 State.setDebugLocFromInst(Instr); 2641 Value *PoisonVec = PoisonValue::get(VecTy); 2642 2643 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( 2644 unsigned Part, Value *MaskForGaps) -> Value * { 2645 if (VF.isScalable()) { 2646 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2647 assert(InterleaveFactor == 2 && 2648 "Unsupported deinterleave factor for scalable vectors"); 2649 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2650 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2651 auto *MaskTy = 2652 VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); 2653 return Builder.CreateIntrinsic( 2654 MaskTy, Intrinsic::experimental_vector_interleave2, Ops, 2655 /*FMFSource=*/nullptr, "interleaved.mask"); 2656 } 2657 2658 if (!BlockInMask) 2659 return MaskForGaps; 2660 2661 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2662 Value *ShuffledMask = Builder.CreateShuffleVector( 2663 BlockInMaskPart, 2664 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2665 "interleaved.mask"); 2666 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2667 MaskForGaps) 2668 : ShuffledMask; 2669 }; 2670 2671 // Vectorize the interleaved load group. 2672 if (isa<LoadInst>(Instr)) { 2673 Value *MaskForGaps = nullptr; 2674 if (NeedsMaskForGaps) { 2675 MaskForGaps = 2676 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2677 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2678 } 2679 2680 // For each unroll part, create a wide load for the group. 2681 SmallVector<Value *, 2> NewLoads; 2682 for (unsigned Part = 0; Part < UF; Part++) { 2683 Instruction *NewLoad; 2684 if (BlockInMask || MaskForGaps) { 2685 assert(useMaskedInterleavedAccesses(*TTI) && 2686 "masked interleaved groups are not allowed."); 2687 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2688 NewLoad = 2689 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2690 GroupMask, PoisonVec, "wide.masked.vec"); 2691 } 2692 else 2693 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2694 Group->getAlign(), "wide.vec"); 2695 Group->addMetadata(NewLoad); 2696 NewLoads.push_back(NewLoad); 2697 } 2698 2699 if (VecTy->isScalableTy()) { 2700 assert(InterleaveFactor == 2 && 2701 "Unsupported deinterleave factor for scalable vectors"); 2702 2703 for (unsigned Part = 0; Part < UF; ++Part) { 2704 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2705 // so must use intrinsics to deinterleave. 2706 Value *DI = Builder.CreateIntrinsic( 2707 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], 2708 /*FMFSource=*/nullptr, "strided.vec"); 2709 unsigned J = 0; 2710 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2711 Instruction *Member = Group->getMember(I); 2712 2713 if (!Member) 2714 continue; 2715 2716 Value *StridedVec = Builder.CreateExtractValue(DI, I); 2717 // If this member has different type, cast the result type. 
2718 if (Member->getType() != ScalarTy) { 2719 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2720 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2721 } 2722 2723 if (Group->isReverse()) 2724 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2725 2726 State.set(VPDefs[J], StridedVec, Part); 2727 ++J; 2728 } 2729 } 2730 2731 return; 2732 } 2733 2734 // For each member in the group, shuffle out the appropriate data from the 2735 // wide loads. 2736 unsigned J = 0; 2737 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2738 Instruction *Member = Group->getMember(I); 2739 2740 // Skip the gaps in the group. 2741 if (!Member) 2742 continue; 2743 2744 auto StrideMask = 2745 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2746 for (unsigned Part = 0; Part < UF; Part++) { 2747 Value *StridedVec = Builder.CreateShuffleVector( 2748 NewLoads[Part], StrideMask, "strided.vec"); 2749 2750 // If this member has different type, cast the result type. 2751 if (Member->getType() != ScalarTy) { 2752 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2753 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2754 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2755 } 2756 2757 if (Group->isReverse()) 2758 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2759 2760 State.set(VPDefs[J], StridedVec, Part); 2761 } 2762 ++J; 2763 } 2764 return; 2765 } 2766 2767 // The sub vector type for current instruction. 2768 auto *SubVT = VectorType::get(ScalarTy, VF); 2769 2770 // Vectorize the interleaved store group. 2771 Value *MaskForGaps = 2772 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2773 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2774 "masked interleaved groups are not allowed."); 2775 assert((!MaskForGaps || !VF.isScalable()) && 2776 "masking gaps for scalable vectors is not yet supported."); 2777 for (unsigned Part = 0; Part < UF; Part++) { 2778 // Collect the stored vector from each member. 2779 SmallVector<Value *, 4> StoredVecs; 2780 unsigned StoredIdx = 0; 2781 for (unsigned i = 0; i < InterleaveFactor; i++) { 2782 assert((Group->getMember(i) || MaskForGaps) && 2783 "Fail to get a member from an interleaved store group"); 2784 Instruction *Member = Group->getMember(i); 2785 2786 // Skip the gaps in the group. 2787 if (!Member) { 2788 Value *Undef = PoisonValue::get(SubVT); 2789 StoredVecs.push_back(Undef); 2790 continue; 2791 } 2792 2793 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2794 ++StoredIdx; 2795 2796 if (Group->isReverse()) 2797 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2798 2799 // If this member has different type, cast it to a unified type. 2800 2801 if (StoredVec->getType() != SubVT) 2802 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2803 2804 StoredVecs.push_back(StoredVec); 2805 } 2806 2807 // Interleave all the smaller vectors into one wider vector. 
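    // For example, with an interleave factor of 2 and VF = 4, member vectors
    // <a0, a1, a2, a3> and <b0, b1, b2, b3> are combined into the single wide
    // vector <a0, b0, a1, b1, a2, b2, a3, b3> that is written out below.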
2808 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); 2809 Instruction *NewStoreInstr; 2810 if (BlockInMask || MaskForGaps) { 2811 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2812 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2813 Group->getAlign(), GroupMask); 2814 } else 2815 NewStoreInstr = 2816 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2817 2818 Group->addMetadata(NewStoreInstr); 2819 } 2820 } 2821 2822 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2823 VPReplicateRecipe *RepRecipe, 2824 const VPIteration &Instance, 2825 VPTransformState &State) { 2826 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2827 2828 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2829 // the first lane and part. 2830 if (isa<NoAliasScopeDeclInst>(Instr)) 2831 if (!Instance.isFirstIteration()) 2832 return; 2833 2834 // Does this instruction return a value ? 2835 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2836 2837 Instruction *Cloned = Instr->clone(); 2838 if (!IsVoidRetTy) 2839 Cloned->setName(Instr->getName() + ".cloned"); 2840 2841 RepRecipe->setFlags(Cloned); 2842 2843 if (Instr->getDebugLoc()) 2844 State.setDebugLocFromInst(Instr); 2845 2846 // Replace the operands of the cloned instructions with their scalar 2847 // equivalents in the new loop. 2848 for (const auto &I : enumerate(RepRecipe->operands())) { 2849 auto InputInstance = Instance; 2850 VPValue *Operand = I.value(); 2851 if (vputils::isUniformAfterVectorization(Operand)) 2852 InputInstance.Lane = VPLane::getFirstLane(); 2853 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2854 } 2855 State.addNewMetadata(Cloned, Instr); 2856 2857 // Place the cloned scalar in the new loop. 2858 State.Builder.Insert(Cloned); 2859 2860 State.set(RepRecipe, Cloned, Instance); 2861 2862 // If we just cloned a new assumption, add it the assumption cache. 2863 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2864 AC->registerAssumption(II); 2865 2866 // End if-block. 2867 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); 2868 if (IfPredicateInstr) 2869 PredicatedInstructions.push_back(Cloned); 2870 } 2871 2872 Value * 2873 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2874 if (VectorTripCount) 2875 return VectorTripCount; 2876 2877 Value *TC = getTripCount(); 2878 IRBuilder<> Builder(InsertBlock->getTerminator()); 2879 2880 Type *Ty = TC->getType(); 2881 // This is where we can make the step a runtime constant. 2882 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2883 2884 // If the tail is to be folded by masking, round the number of iterations N 2885 // up to a multiple of Step instead of rounding down. This is done by first 2886 // adding Step-1 and then rounding down. Note that it's ok if this addition 2887 // overflows: the vector induction variable will eventually wrap to zero given 2888 // that it starts at zero and its Step is a power of two; the loop will then 2889 // exit, with the last early-exit vector comparison also producing all-true. 2890 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2891 // is accounted for in emitIterationCountCheck that adds an overflow check. 
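  // For example, with VF = 4 and UF = 2 (Step = 8) and a trip count N = 10,
  // the rounding below turns TC into 10 + 7 = 17, the URem further down
  // yields 17 % 8 = 1, and the vector trip count becomes n.vec = 16: two
  // vector iterations whose masks together cover the 10 scalar iterations.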
2892 if (Cost->foldTailByMasking()) { 2893 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2894 "VF*UF must be a power of 2 when folding tail by masking"); 2895 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2896 TC = Builder.CreateAdd( 2897 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2898 } 2899 2900 // Now we need to generate the expression for the part of the loop that the 2901 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2902 // iterations are not required for correctness, or N - Step, otherwise. Step 2903 // is equal to the vectorization factor (number of SIMD elements) times the 2904 // unroll factor (number of SIMD instructions). 2905 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2906 2907 // There are cases where we *must* run at least one iteration in the remainder 2908 // loop. See the cost model for when this can happen. If the step evenly 2909 // divides the trip count, we set the remainder to be equal to the step. If 2910 // the step does not evenly divide the trip count, no adjustment is necessary 2911 // since there will already be scalar iterations. Note that the minimum 2912 // iterations check ensures that N >= Step. 2913 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2914 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2915 R = Builder.CreateSelect(IsZero, Step, R); 2916 } 2917 2918 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2919 2920 return VectorTripCount; 2921 } 2922 2923 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2924 const DataLayout &DL) { 2925 // Verify that V is a vector type with same number of elements as DstVTy. 2926 auto *DstFVTy = cast<VectorType>(DstVTy); 2927 auto VF = DstFVTy->getElementCount(); 2928 auto *SrcVecTy = cast<VectorType>(V->getType()); 2929 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2930 Type *SrcElemTy = SrcVecTy->getElementType(); 2931 Type *DstElemTy = DstFVTy->getElementType(); 2932 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2933 "Vector elements must have same size"); 2934 2935 // Do a direct cast if element types are castable. 2936 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2937 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2938 } 2939 // V cannot be directly casted to desired vector type. 2940 // May happen when V is a floating point vector but DstVTy is a vector of 2941 // pointers or vice-versa. Handle this using a two-step bitcast using an 2942 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2943 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2944 "Only one type should be a pointer type"); 2945 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2946 "Only one type should be a floating point type"); 2947 Type *IntTy = 2948 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2949 auto *VecIntTy = VectorType::get(IntTy, VF); 2950 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2951 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2952 } 2953 2954 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2955 Value *Count = getTripCount(); 2956 // Reuse existing vector loop preheader for TC checks. 2957 // Note that new preheader block is generated for vector loop. 
2958 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2959 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2960 2961 // Generate code to check if the loop's trip count is less than VF * UF, or 2962 // equal to it in case a scalar epilogue is required; this implies that the 2963 // vector trip count is zero. This check also covers the case where adding one 2964 // to the backedge-taken count overflowed leading to an incorrect trip count 2965 // of zero. In this case we will also jump to the scalar loop. 2966 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2967 : ICmpInst::ICMP_ULT; 2968 2969 // If tail is to be folded, vector loop takes care of all iterations. 2970 Type *CountTy = Count->getType(); 2971 Value *CheckMinIters = Builder.getFalse(); 2972 auto CreateStep = [&]() -> Value * { 2973 // Create step with max(MinProTripCount, UF * VF). 2974 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2975 return createStepForVF(Builder, CountTy, VF, UF); 2976 2977 Value *MinProfTC = 2978 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2979 if (!VF.isScalable()) 2980 return MinProfTC; 2981 return Builder.CreateBinaryIntrinsic( 2982 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2983 }; 2984 2985 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2986 if (Style == TailFoldingStyle::None) 2987 CheckMinIters = 2988 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2989 else if (VF.isScalable() && 2990 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2991 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2992 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2993 // an overflow to zero when updating induction variables and so an 2994 // additional overflow check is required before entering the vector loop. 2995 2996 // Get the maximum unsigned value for the type. 2997 Value *MaxUIntTripCount = 2998 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2999 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 3000 3001 // Don't execute the vector loop if (UMax - n) < (VF * UF). 3002 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 3003 } 3004 3005 // Create new preheader for vector loop. 3006 LoopVectorPreHeader = 3007 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3008 "vector.ph"); 3009 3010 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3011 DT->getNode(Bypass)->getIDom()) && 3012 "TC check is expected to dominate Bypass"); 3013 3014 // Update dominator for Bypass & LoopExit (if needed). 3015 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3016 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3017 // If there is an epilogue which must run, there's no edge from the 3018 // middle block to exit blocks and thus no need to update the immediate 3019 // dominator of the exit blocks. 
3020 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3021 3022 ReplaceInstWithInst( 3023 TCCheckBlock->getTerminator(), 3024 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3025 LoopBypassBlocks.push_back(TCCheckBlock); 3026 } 3027 3028 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3029 BasicBlock *const SCEVCheckBlock = 3030 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3031 if (!SCEVCheckBlock) 3032 return nullptr; 3033 3034 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3035 (OptForSizeBasedOnProfile && 3036 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3037 "Cannot SCEV check stride or overflow when optimizing for size"); 3038 3039 3040 // Update dominator only if this is first RT check. 3041 if (LoopBypassBlocks.empty()) { 3042 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3043 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3044 // If there is an epilogue which must run, there's no edge from the 3045 // middle block to exit blocks and thus no need to update the immediate 3046 // dominator of the exit blocks. 3047 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3048 } 3049 3050 LoopBypassBlocks.push_back(SCEVCheckBlock); 3051 AddedSafetyChecks = true; 3052 return SCEVCheckBlock; 3053 } 3054 3055 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3056 // VPlan-native path does not do any analysis for runtime checks currently. 3057 if (EnableVPlanNativePath) 3058 return nullptr; 3059 3060 BasicBlock *const MemCheckBlock = 3061 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3062 3063 // Check if we generated code that checks in runtime if arrays overlap. We put 3064 // the checks into a separate block to make the more common case of few 3065 // elements faster. 3066 if (!MemCheckBlock) 3067 return nullptr; 3068 3069 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3070 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3071 "Cannot emit memory checks when optimizing for size, unless forced " 3072 "to vectorize."); 3073 ORE->emit([&]() { 3074 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3075 OrigLoop->getStartLoc(), 3076 OrigLoop->getHeader()) 3077 << "Code-size may be reduced by not forcing " 3078 "vectorization, or by source-code modifications " 3079 "eliminating the need for runtime checks " 3080 "(e.g., adding 'restrict')."; 3081 }); 3082 } 3083 3084 LoopBypassBlocks.push_back(MemCheckBlock); 3085 3086 AddedSafetyChecks = true; 3087 3088 return MemCheckBlock; 3089 } 3090 3091 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3092 LoopScalarBody = OrigLoop->getHeader(); 3093 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3094 assert(LoopVectorPreHeader && "Invalid loop structure"); 3095 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3096 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && 3097 "multiple exit loop without required epilogue?"); 3098 3099 LoopMiddleBlock = 3100 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3101 LI, nullptr, Twine(Prefix) + "middle.block"); 3102 LoopScalarPreHeader = 3103 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3104 nullptr, Twine(Prefix) + "scalar.ph"); 3105 3106 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3107 3108 // Set up the middle block terminator. 
Two cases:
3109   //  1) If we know that we must execute the scalar epilogue, emit an
3110   //     unconditional branch.
3111   //  2) Otherwise, we must have a single unique exit block (due to how we
3112   //     implement the multiple exit case). In this case, set up a conditional
3113   //     branch from the middle block to the loop scalar preheader, and the
3114   //     exit block. completeLoopSkeleton will update the condition to use an
3115   //     iteration check, if required to decide whether to execute the remainder.
3116   BranchInst *BrInst =
3117       Cost->requiresScalarEpilogue(VF.isVector())
3118           ? BranchInst::Create(LoopScalarPreHeader)
3119           : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3120                                Builder.getTrue());
3121   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3122   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3123
3124   // Update dominator for loop exit. During skeleton creation, only the vector
3125   // pre-header and the middle block are created. The vector loop is entirely
3126   // created during VPlan execution.
3127   if (!Cost->requiresScalarEpilogue(VF.isVector()))
3128     // If there is an epilogue which must run, there's no edge from the
3129     // middle block to exit blocks and thus no need to update the immediate
3130     // dominator of the exit blocks.
3131     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3132 }
3133
3134 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3135     PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3136     ArrayRef<BasicBlock *> BypassBlocks,
3137     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3138   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3139   assert(VectorTripCount && "Expected valid arguments");
3140
3141   Instruction *OldInduction = Legal->getPrimaryInduction();
3142   Value *&EndValue = IVEndValues[OrigPhi];
3143   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3144   if (OrigPhi == OldInduction) {
3145     // We know what the end value is.
3146     EndValue = VectorTripCount;
3147   } else {
3148     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3149
3150     // Fast-math-flags propagate from the original induction instruction.
3151     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3152       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3153
3154     EndValue =
3155         emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3156     EndValue->setName("ind.end");
3157
3158     // Compute the end value for the additional bypass (if applicable).
3159     if (AdditionalBypass.first) {
3160       B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3161       EndValueFromAdditionalBypass = emitTransformedIndex(
3162           B, AdditionalBypass.second, II.getStartValue(), Step, II);
3163       EndValueFromAdditionalBypass->setName("ind.end");
3164     }
3165   }
3166
3167   // Create phi nodes to merge from the backedge-taken check block.
3168   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3169                                          LoopScalarPreHeader->getTerminator());
3170   // Copy original phi DL over to the new one.
3171   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3172
3173   // The new PHI merges the original incoming value, in case of a bypass,
3174   // or the value at the end of the vectorized loop.
3175   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3176
3177   // Fix the scalar body counter (PHI node).
3178   // The old induction's phi node in the scalar body needs the truncated
3179   // value.
3180 for (BasicBlock *BB : BypassBlocks) 3181 BCResumeVal->addIncoming(II.getStartValue(), BB); 3182 3183 if (AdditionalBypass.first) 3184 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3185 EndValueFromAdditionalBypass); 3186 return BCResumeVal; 3187 } 3188 3189 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 3190 /// expansion results. 3191 static Value *getExpandedStep(const InductionDescriptor &ID, 3192 const SCEV2ValueTy &ExpandedSCEVs) { 3193 const SCEV *Step = ID.getStep(); 3194 if (auto *C = dyn_cast<SCEVConstant>(Step)) 3195 return C->getValue(); 3196 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 3197 return U->getValue(); 3198 auto I = ExpandedSCEVs.find(Step); 3199 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 3200 return I->second; 3201 } 3202 3203 void InnerLoopVectorizer::createInductionResumeValues( 3204 const SCEV2ValueTy &ExpandedSCEVs, 3205 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3206 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3207 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3208 "Inconsistent information about additional bypass."); 3209 // We are going to resume the execution of the scalar loop. 3210 // Go over all of the induction variables that we found and fix the 3211 // PHIs that are left in the scalar version of the loop. 3212 // The starting values of PHI nodes depend on the counter of the last 3213 // iteration in the vectorized loop. 3214 // If we come from a bypass edge then we need to start from the original 3215 // start value. 3216 for (const auto &InductionEntry : Legal->getInductionVars()) { 3217 PHINode *OrigPhi = InductionEntry.first; 3218 const InductionDescriptor &II = InductionEntry.second; 3219 PHINode *BCResumeVal = createInductionResumeValue( 3220 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, 3221 AdditionalBypass); 3222 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3223 } 3224 } 3225 3226 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3227 // The trip counts should be cached by now. 3228 Value *Count = getTripCount(); 3229 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3230 3231 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3232 3233 // Add a check in the middle block to see if we have completed 3234 // all of the iterations in the first vector loop. Three cases: 3235 // 1) If we require a scalar epilogue, there is no conditional branch as 3236 // we unconditionally branch to the scalar preheader. Do nothing. 3237 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3238 // Thus if tail is to be folded, we know we don't need to run the 3239 // remainder and we can use the previous value for the condition (true). 3240 // 3) Otherwise, construct a runtime check. 3241 if (!Cost->requiresScalarEpilogue(VF.isVector()) && 3242 !Cost->foldTailByMasking()) { 3243 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3244 Count, VectorTripCount, "cmp.n", 3245 LoopMiddleBlock->getTerminator()); 3246 3247 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3248 // of the corresponding compare because they may have ended up with 3249 // different line numbers and we want to avoid awkward line stepping while 3250 // debugging. Eg. if the compare has got a line number inside the loop. 
3251 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3252 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3253 } 3254 3255 #ifdef EXPENSIVE_CHECKS 3256 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3257 #endif 3258 3259 return LoopVectorPreHeader; 3260 } 3261 3262 std::pair<BasicBlock *, Value *> 3263 InnerLoopVectorizer::createVectorizedLoopSkeleton( 3264 const SCEV2ValueTy &ExpandedSCEVs) { 3265 /* 3266 In this function we generate a new loop. The new loop will contain 3267 the vectorized instructions while the old loop will continue to run the 3268 scalar remainder. 3269 3270 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 3271 / | preheader are expanded here. Eventually all required SCEV 3272 / | expansion should happen here. 3273 / v 3274 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3275 | / | 3276 | / v 3277 || [ ] <-- vector pre header. 3278 |/ | 3279 | v 3280 | [ ] \ 3281 | [ ]_| <-- vector loop (created during VPlan execution). 3282 | | 3283 | v 3284 \ -[ ] <--- middle-block. 3285 \/ | 3286 /\ v 3287 | ->[ ] <--- new preheader. 3288 | | 3289 (opt) v <-- edge from middle to exit iff epilogue is not required. 3290 | [ ] \ 3291 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3292 \ | 3293 \ v 3294 >[ ] <-- exit block(s). 3295 ... 3296 */ 3297 3298 // Create an empty vector loop, and prepare basic blocks for the runtime 3299 // checks. 3300 createVectorLoopSkeleton(""); 3301 3302 // Now, compare the new count to zero. If it is zero skip the vector loop and 3303 // jump to the scalar loop. This check also covers the case where the 3304 // backedge-taken count is uint##_max: adding one to it will overflow leading 3305 // to an incorrect trip count of zero. In this (rare) case we will also jump 3306 // to the scalar loop. 3307 emitIterationCountCheck(LoopScalarPreHeader); 3308 3309 // Generate the code to check any assumptions that we've made for SCEV 3310 // expressions. 3311 emitSCEVChecks(LoopScalarPreHeader); 3312 3313 // Generate the code that checks in runtime if arrays overlap. We put the 3314 // checks into a separate block to make the more common case of few elements 3315 // faster. 3316 emitMemRuntimeChecks(LoopScalarPreHeader); 3317 3318 // Emit phis for the new starting index of the scalar loop. 3319 createInductionResumeValues(ExpandedSCEVs); 3320 3321 return {completeLoopSkeleton(), nullptr}; 3322 } 3323 3324 // Fix up external users of the induction variable. At this point, we are 3325 // in LCSSA form, with all external PHIs that use the IV having one input value, 3326 // coming from the remainder loop. We need those PHIs to also have a correct 3327 // value for the IV when arriving directly from the middle block. 3328 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3329 const InductionDescriptor &II, 3330 Value *VectorTripCount, Value *EndValue, 3331 BasicBlock *MiddleBlock, 3332 BasicBlock *VectorHeader, VPlan &Plan, 3333 VPTransformState &State) { 3334 // There are two kinds of external IV usages - those that use the value 3335 // computed in the last iteration (the PHI) and those that use the penultimate 3336 // value (the value that feeds into the phi from the loop latch). 3337 // We allow both, but they, obviously, have different values. 
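  //
  // For illustration, given
  //
  //   for (i = 0; i < n; ++i) { last = i; ... }
  //
  // an external use of i consumes the value that fed the phi from the latch
  // (the "last" value) and receives EndValue below, whereas an external use
  // of last consumes the phi itself (the penultimate value) and receives
  // EndValue - Step.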
3338
3339   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3340
3341   DenseMap<Value *, Value *> MissingVals;
3342
3343   // An external user of the last iteration's value should see the value that
3344   // the remainder loop uses to initialize its own IV.
3345   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3346   for (User *U : PostInc->users()) {
3347     Instruction *UI = cast<Instruction>(U);
3348     if (!OrigLoop->contains(UI)) {
3349       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3350       MissingVals[UI] = EndValue;
3351     }
3352   }
3353
3354   // An external user of the penultimate value needs to see EndValue - Step.
3355   // The simplest way to get this is to recompute it from the constituent SCEVs,
3356   // that is Start + (Step * (CRD - 1)).
3357   for (User *U : OrigPhi->users()) {
3358     auto *UI = cast<Instruction>(U);
3359     if (!OrigLoop->contains(UI)) {
3360       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3361       IRBuilder<> B(MiddleBlock->getTerminator());
3362
3363       // Fast-math-flags propagate from the original induction instruction.
3364       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3365         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3366
3367       Value *CountMinusOne = B.CreateSub(
3368           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3369       CountMinusOne->setName("cmo");
3370
3371       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3372       assert(StepVPV && "step must have been expanded during VPlan execution");
3373       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3374                                         : State.get(StepVPV, {0, 0});
3375       Value *Escape =
3376           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3377       Escape->setName("ind.escape");
3378       MissingVals[UI] = Escape;
3379     }
3380   }
3381
3382   for (auto &I : MissingVals) {
3383     PHINode *PHI = cast<PHINode>(I.first);
3384     // One corner case we have to handle is two IVs "chasing" each other,
3385     // that is %IV2 = phi [...], [ %IV1, %latch ]
3386     // In this case, if IV1 has an external use, we need to avoid adding both
3387     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3388     // don't already have an incoming value for the middle block.
3389     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3390       PHI->addIncoming(I.second, MiddleBlock);
3391       Plan.removeLiveOut(PHI);
3392     }
3393   }
3394 }
3395
3396 namespace {
3397
3398 struct CSEDenseMapInfo {
3399   static bool canHandle(const Instruction *I) {
3400     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3401            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3402   }
3403
3404   static inline Instruction *getEmptyKey() {
3405     return DenseMapInfo<Instruction *>::getEmptyKey();
3406   }
3407
3408   static inline Instruction *getTombstoneKey() {
3409     return DenseMapInfo<Instruction *>::getTombstoneKey();
3410   }
3411
3412   static unsigned getHashValue(const Instruction *I) {
3413     assert(canHandle(I) && "Unknown instruction!");
3414     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3415                                                            I->value_op_end()));
3416   }
3417
3418   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3419     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3420         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3421       return LHS == RHS;
3422     return LHS->isIdenticalTo(RHS);
3423   }
3424 };
3425
3426 } // end anonymous namespace
3427
3428 /// Perform CSE of induction variable instructions.
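/// For example, identical getelementptr, extractelement, insertelement or
/// shufflevector instructions emitted while widening the induction end up
/// being merged into a single instruction here.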
3429 static void cse(BasicBlock *BB) { 3430 // Perform simple cse. 3431 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3432 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3433 if (!CSEDenseMapInfo::canHandle(&In)) 3434 continue; 3435 3436 // Check if we can replace this instruction with any of the 3437 // visited instructions. 3438 if (Instruction *V = CSEMap.lookup(&In)) { 3439 In.replaceAllUsesWith(V); 3440 In.eraseFromParent(); 3441 continue; 3442 } 3443 3444 CSEMap[&In] = &In; 3445 } 3446 } 3447 3448 InstructionCost LoopVectorizationCostModel::getVectorCallCost( 3449 CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { 3450 Function *F = CI->getCalledFunction(); 3451 Type *ScalarRetTy = CI->getType(); 3452 SmallVector<Type *, 4> Tys, ScalarTys; 3453 bool MaskRequired = Legal->isMaskRequired(CI); 3454 for (auto &ArgOp : CI->args()) 3455 ScalarTys.push_back(ArgOp->getType()); 3456 3457 // Estimate cost of scalarized vector call. The source operands are assumed 3458 // to be vectors, so we need to extract individual elements from there, 3459 // execute VF scalar calls, and then gather the result into the vector return 3460 // value. 3461 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3462 InstructionCost ScalarCallCost = 3463 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); 3464 if (VF.isScalar()) 3465 return ScalarCallCost; 3466 3467 // Compute corresponding vector type for return value and arguments. 3468 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3469 for (Type *ScalarTy : ScalarTys) 3470 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3471 3472 // Compute costs of unpacking argument values for the scalar calls and 3473 // packing the return values to a vector. 3474 InstructionCost ScalarizationCost = 3475 getScalarizationOverhead(CI, VF, CostKind); 3476 3477 InstructionCost Cost = 3478 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3479 3480 // If we can't emit a vector call for this function, then the currently found 3481 // cost is the cost we need to return. 3482 InstructionCost MaskCost = 0; 3483 VFShape Shape = VFShape::get(*CI, VF, MaskRequired); 3484 if (NeedsMask) 3485 *NeedsMask = MaskRequired; 3486 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3487 // If we want an unmasked vector function but can't find one matching the VF, 3488 // maybe we can find vector function that does use a mask and synthesize 3489 // an all-true mask. 3490 if (!VecFunc && !MaskRequired) { 3491 Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); 3492 VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3493 // If we found one, add in the cost of creating a mask 3494 if (VecFunc) { 3495 if (NeedsMask) 3496 *NeedsMask = true; 3497 MaskCost = TTI.getShuffleCost( 3498 TargetTransformInfo::SK_Broadcast, 3499 VectorType::get( 3500 IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), 3501 VF)); 3502 } 3503 } 3504 3505 // We don't support masked function calls yet, but we can scalarize a 3506 // masked call with branches (unless VF is scalable). 3507 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3508 return VF.isScalable() ? InstructionCost::getInvalid() : Cost; 3509 3510 // If the corresponding vector cost is cheaper, return its cost. 
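  // For example (with purely illustrative numbers): at VF = 4, a scalar call
  // cost of 10 and a scalarization overhead of 12 give a scalarized cost of
  // 4 * 10 + 12 = 52 above; if a matching vector variant exists and costs 20
  // (plus any mask-broadcast cost), the vector call is selected below.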
3511 InstructionCost VectorCallCost = 3512 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 3513 if (VectorCallCost < Cost) { 3514 *Variant = VecFunc; 3515 Cost = VectorCallCost; 3516 } 3517 return Cost; 3518 } 3519 3520 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3521 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3522 return Elt; 3523 return VectorType::get(Elt, VF); 3524 } 3525 3526 InstructionCost 3527 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3528 ElementCount VF) const { 3529 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3530 assert(ID && "Expected intrinsic call!"); 3531 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3532 FastMathFlags FMF; 3533 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3534 FMF = FPMO->getFastMathFlags(); 3535 3536 SmallVector<const Value *> Arguments(CI->args()); 3537 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3538 SmallVector<Type *> ParamTys; 3539 std::transform(FTy->param_begin(), FTy->param_end(), 3540 std::back_inserter(ParamTys), 3541 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3542 3543 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3544 dyn_cast<IntrinsicInst>(CI)); 3545 return TTI.getIntrinsicInstrCost(CostAttrs, 3546 TargetTransformInfo::TCK_RecipThroughput); 3547 } 3548 3549 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3550 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3551 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3552 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3553 } 3554 3555 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3556 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3557 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3558 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3559 } 3560 3561 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3562 // For every instruction `I` in MinBWs, truncate the operands, create a 3563 // truncated version of `I` and reextend its result. InstCombine runs 3564 // later and will remove any ext/trunc pairs. 3565 SmallPtrSet<Value *, 4> Erased; 3566 for (const auto &KV : Cost->getMinimalBitwidths()) { 3567 // If the value wasn't vectorized, we must maintain the original scalar 3568 // type. The absence of the value from State indicates that it 3569 // wasn't vectorized. 3570 // FIXME: Should not rely on getVPValue at this point. 3571 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3572 if (!State.hasAnyVectorValue(Def)) 3573 continue; 3574 for (unsigned Part = 0; Part < UF; ++Part) { 3575 Value *I = State.get(Def, Part); 3576 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3577 continue; 3578 Type *OriginalTy = I->getType(); 3579 Type *ScalarTruncatedTy = 3580 IntegerType::get(OriginalTy->getContext(), KV.second); 3581 auto *TruncatedTy = VectorType::get( 3582 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3583 if (TruncatedTy == OriginalTy) 3584 continue; 3585 3586 IRBuilder<> B(cast<Instruction>(I)); 3587 auto ShrinkOperand = [&](Value *V) -> Value * { 3588 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3589 if (ZI->getSrcTy() == TruncatedTy) 3590 return ZI->getOperand(0); 3591 return B.CreateZExtOrTrunc(V, TruncatedTy); 3592 }; 3593 3594 // The actual instruction modification depends on the instruction type, 3595 // unfortunately. 
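      // For example, if MinBWs records that an i32 add only needs 8 bits, the
      // add is re-created below on operands shrunk to <VF x i8> and its result
      // is extended back to <VF x i32> at the end; InstCombine is expected to
      // remove the redundant trunc/ext pairs afterwards.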
3596 Value *NewI = nullptr; 3597 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3598 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3599 ShrinkOperand(BO->getOperand(1))); 3600 3601 // Any wrapping introduced by shrinking this operation shouldn't be 3602 // considered undefined behavior. So, we can't unconditionally copy 3603 // arithmetic wrapping flags to NewI. 3604 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3605 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3606 NewI = 3607 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3608 ShrinkOperand(CI->getOperand(1))); 3609 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3610 NewI = B.CreateSelect(SI->getCondition(), 3611 ShrinkOperand(SI->getTrueValue()), 3612 ShrinkOperand(SI->getFalseValue())); 3613 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3614 switch (CI->getOpcode()) { 3615 default: 3616 llvm_unreachable("Unhandled cast!"); 3617 case Instruction::Trunc: 3618 NewI = ShrinkOperand(CI->getOperand(0)); 3619 break; 3620 case Instruction::SExt: 3621 NewI = B.CreateSExtOrTrunc( 3622 CI->getOperand(0), 3623 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3624 break; 3625 case Instruction::ZExt: 3626 NewI = B.CreateZExtOrTrunc( 3627 CI->getOperand(0), 3628 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3629 break; 3630 } 3631 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3632 auto Elements0 = 3633 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3634 auto *O0 = B.CreateZExtOrTrunc( 3635 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3636 auto Elements1 = 3637 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3638 auto *O1 = B.CreateZExtOrTrunc( 3639 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3640 3641 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3642 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3643 // Don't do anything with the operands, just extend the result. 3644 continue; 3645 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3646 auto Elements = 3647 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3648 auto *O0 = B.CreateZExtOrTrunc( 3649 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3650 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3651 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3652 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3653 auto Elements = 3654 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3655 auto *O0 = B.CreateZExtOrTrunc( 3656 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3657 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3658 } else { 3659 // If we don't know what to do, be conservative and don't do anything. 3660 continue; 3661 } 3662 3663 // Lastly, extend the result. 3664 NewI->takeName(cast<Instruction>(I)); 3665 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3666 I->replaceAllUsesWith(Res); 3667 cast<Instruction>(I)->eraseFromParent(); 3668 Erased.insert(I); 3669 State.reset(Def, Res, Part); 3670 } 3671 } 3672 3673 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3674 for (const auto &KV : Cost->getMinimalBitwidths()) { 3675 // If the value wasn't vectorized, we must maintain the original scalar 3676 // type. The absence of the value from State indicates that it 3677 // wasn't vectorized. 3678 // FIXME: Should not rely on getVPValue at this point. 
3679 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3680 if (!State.hasAnyVectorValue(Def)) 3681 continue; 3682 for (unsigned Part = 0; Part < UF; ++Part) { 3683 Value *I = State.get(Def, Part); 3684 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3685 if (Inst && Inst->use_empty()) { 3686 Value *NewI = Inst->getOperand(0); 3687 Inst->eraseFromParent(); 3688 State.reset(Def, NewI, Part); 3689 } 3690 } 3691 } 3692 } 3693 3694 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3695 VPlan &Plan) { 3696 // Insert truncates and extends for any truncated instructions as hints to 3697 // InstCombine. 3698 if (VF.isVector()) 3699 truncateToMinimalBitwidths(State); 3700 3701 // Fix widened non-induction PHIs by setting up the PHI operands. 3702 if (EnableVPlanNativePath) 3703 fixNonInductionPHIs(Plan, State); 3704 3705 // At this point every instruction in the original loop is widened to a 3706 // vector form. Now we need to fix the recurrences in the loop. These PHI 3707 // nodes are currently empty because we did not want to introduce cycles. 3708 // This is the second stage of vectorizing recurrences. 3709 fixCrossIterationPHIs(State); 3710 3711 // Forget the original basic block. 3712 PSE.getSE()->forgetLoop(OrigLoop); 3713 3714 // After vectorization, the exit blocks of the original loop will have 3715 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 3716 // looked through single-entry phis. 3717 SmallVector<BasicBlock *> ExitBlocks; 3718 OrigLoop->getExitBlocks(ExitBlocks); 3719 for (BasicBlock *Exit : ExitBlocks) 3720 for (PHINode &PN : Exit->phis()) 3721 PSE.getSE()->forgetValue(&PN); 3722 3723 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3724 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3725 if (Cost->requiresScalarEpilogue(VF.isVector())) { 3726 // No edge from the middle block to the unique exit block has been inserted 3727 // and there is nothing to fix from vector loop; phis should have incoming 3728 // from scalar loop only. 3729 } else { 3730 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking 3731 // the cost model. 3732 3733 // If we inserted an edge from the middle block to the unique exit block, 3734 // update uses outside the loop (phis) to account for the newly inserted 3735 // edge. 3736 3737 // Fix-up external users of the induction variables. 3738 for (const auto &Entry : Legal->getInductionVars()) 3739 fixupIVUsers(Entry.first, Entry.second, 3740 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3741 IVEndValues[Entry.first], LoopMiddleBlock, 3742 VectorLoop->getHeader(), Plan, State); 3743 } 3744 3745 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3746 // in the exit block, so update the builder. 3747 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3748 for (const auto &KV : Plan.getLiveOuts()) 3749 KV.second->fixPhi(Plan, State); 3750 3751 for (Instruction *PI : PredicatedInstructions) 3752 sinkScalarOperands(&*PI); 3753 3754 // Remove redundant induction instructions. 3755 cse(VectorLoop->getHeader()); 3756 3757 // Set/update profile weights for the vector and remainder loops as original 3758 // loop iterations are now distributed among them. Note that original loop 3759 // represented by LoopScalarBody becomes remainder loop after vectorization. 
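  // For example, if the original loop's profile implies roughly 80 iterations
  // per entry and VF * UF = 8, the vector loop is assigned about 10 iterations,
  // with the remaining iterations attributed to the scalar remainder loop.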
3760   //
3761   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3762   // end up with a slightly less precise result, but that should be OK since
3763   // profile is not inherently precise anyway. Note also possible bypass of
3764   // vector code caused by legality checks is ignored, assigning all the weight
3765   // to the vector loop, optimistically.
3766   //
3767   // For scalable vectorization we can't know at compile time how many iterations
3768   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3769   // vscale of '1'.
3770   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3771                                LI->getLoopFor(LoopScalarBody),
3772                                VF.getKnownMinValue() * UF);
3773 }
3774
3775 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3776   // In order to support recurrences we need to be able to vectorize Phi nodes.
3777   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3778   // stage #2: We now need to fix the recurrences by adding incoming edges to
3779   // the currently empty PHI nodes. At this point every instruction in the
3780   // original loop is widened to a vector form so we can use them to construct
3781   // the incoming edges.
3782   VPBasicBlock *Header =
3783       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3784   for (VPRecipeBase &R : Header->phis()) {
3785     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3786       fixReduction(ReductionPhi, State);
3787     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3788       fixFixedOrderRecurrence(FOR, State);
3789   }
3790 }
3791
3792 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3793     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3794   // This is the second phase of vectorizing first-order recurrences. An
3795   // overview of the transformation is described below. Suppose we have the
3796   // following loop.
3797   //
3798   //   for (int i = 0; i < n; ++i)
3799   //     b[i] = a[i] - a[i - 1];
3800   //
3801   // There is a first-order recurrence on "a". For this loop, the shorthand
3802   // scalar IR looks like:
3803   //
3804   //   scalar.ph:
3805   //     s_init = a[-1]
3806   //     br scalar.body
3807   //
3808   //   scalar.body:
3809   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3810   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3811   //     s2 = a[i]
3812   //     b[i] = s2 - s1
3813   //     br cond, scalar.body, ...
3814   //
3815   // In this example, s1 is a recurrence because its value depends on the
3816   // previous iteration. In the first phase of vectorization, we created a
3817   // vector phi v1 for s1. We now complete the vectorization and produce the
3818   // shorthand vector IR shown below (for VF = 4, UF = 1).
3819   //
3820   //   vector.ph:
3821   //     v_init = vector(..., ..., ..., a[-1])
3822   //     br vector.body
3823   //
3824   //   vector.body:
3825   //     i = phi [0, vector.ph], [i+4, vector.body]
3826   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3827   //     v2 = a[i, i+1, i+2, i+3];
3828   //     v3 = vector(v1(3), v2(0, 1, 2))
3829   //     b[i, i+1, i+2, i+3] = v2 - v3
3830   //     br cond, vector.body, middle.block
3831   //
3832   //   middle.block:
3833   //     x = v2(3)
3834   //     br scalar.ph
3835   //
3836   //   scalar.ph:
3837   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3838   //     br scalar.body
3839   //
3840   // After the vector loop finishes executing, we extract the next value of
3841   // the recurrence (x) to use as the initial value in the scalar loop.
3842
3843   // Extract the last vector element in the middle block.
This will be the 3844 // initial value for the recurrence when jumping to the scalar loop. 3845 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3846 Value *Incoming = State.get(PreviousDef, UF - 1); 3847 auto *ExtractForScalar = Incoming; 3848 auto *IdxTy = Builder.getInt32Ty(); 3849 Value *RuntimeVF = nullptr; 3850 if (VF.isVector()) { 3851 auto *One = ConstantInt::get(IdxTy, 1); 3852 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3853 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3854 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3855 ExtractForScalar = 3856 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); 3857 } 3858 3859 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); 3860 assert(PhiR->getNumUsers() == 1 && 3861 RecurSplice->getOpcode() == 3862 VPInstruction::FirstOrderRecurrenceSplice && 3863 "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); 3864 SmallVector<VPLiveOut *> LiveOuts; 3865 for (VPUser *U : RecurSplice->users()) 3866 if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) 3867 LiveOuts.push_back(LiveOut); 3868 3869 if (!LiveOuts.empty()) { 3870 // Extract the second last element in the middle block if the 3871 // Phi is used outside the loop. We need to extract the phi itself 3872 // and not the last element (the phi update in the current iteration). This 3873 // will be the value when jumping to the exit block from the 3874 // LoopMiddleBlock, when the scalar loop is not run at all. 3875 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3876 if (VF.isVector()) { 3877 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3878 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3879 Incoming, Idx, "vector.recur.extract.for.phi"); 3880 } else { 3881 assert(UF > 1 && "VF and UF cannot both be 1"); 3882 // When loop is unrolled without vectorizing, initialize 3883 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled 3884 // value of `Incoming`. This is analogous to the vectorized case above: 3885 // extracting the second last element when VF > 1. 3886 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3887 } 3888 3889 for (VPLiveOut *LiveOut : LiveOuts) { 3890 assert(!Cost->requiresScalarEpilogue(VF.isVector())); 3891 PHINode *LCSSAPhi = LiveOut->getPhi(); 3892 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3893 State.Plan->removeLiveOut(LCSSAPhi); 3894 } 3895 } 3896 3897 // Fix the initial value of the original recurrence in the scalar loop. 3898 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3899 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3900 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3901 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3902 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3903 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3904 Start->addIncoming(Incoming, BB); 3905 } 3906 3907 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3908 Phi->setName("scalar.recur"); 3909 } 3910 3911 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 3912 VPTransformState &State) { 3913 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 3914 // Get it's reduction variable descriptor. 
3915 assert(Legal->isReductionVariable(OrigPhi) && 3916 "Unable to find the reduction variable"); 3917 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3918 3919 RecurKind RK = RdxDesc.getRecurrenceKind(); 3920 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3921 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3922 State.setDebugLocFromInst(ReductionStartValue); 3923 3924 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3925 // This is the vector-clone of the value that leaves the loop. 3926 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3927 3928 // Before each round, move the insertion point right between 3929 // the PHIs and the values we are going to write. 3930 // This allows us to write both PHINodes and the extractelement 3931 // instructions. 3932 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3933 3934 State.setDebugLocFromInst(LoopExitInst); 3935 3936 Type *PhiTy = OrigPhi->getType(); 3937 3938 VPBasicBlock *LatchVPBB = 3939 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3940 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3941 // If tail is folded by masking, the vector value to leave the loop should be 3942 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3943 // instead of the former. For an inloop reduction the reduction will already 3944 // be predicated, and does not need to be handled here. 3945 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3946 for (unsigned Part = 0; Part < UF; ++Part) { 3947 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3948 SelectInst *Sel = nullptr; 3949 for (User *U : VecLoopExitInst->users()) { 3950 if (isa<SelectInst>(U)) { 3951 assert(!Sel && "Reduction exit feeding two selects"); 3952 Sel = cast<SelectInst>(U); 3953 } else 3954 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3955 } 3956 assert(Sel && "Reduction exit feeds no select"); 3957 State.reset(LoopExitInstDef, Sel, Part); 3958 3959 if (isa<FPMathOperator>(Sel)) 3960 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3961 3962 // If the target can create a predicated operator for the reduction at no 3963 // extra cost in the loop (for example a predicated vadd), it can be 3964 // cheaper for the select to remain in the loop than be sunk out of it, 3965 // and so use the select value for the phi instead of the old 3966 // LoopExitValue. 3967 if (PreferPredicatedReductionSelect || 3968 TTI->preferPredicatedReductionSelect( 3969 RdxDesc.getOpcode(), PhiTy, 3970 TargetTransformInfo::ReductionFlags())) { 3971 auto *VecRdxPhi = 3972 cast<PHINode>(State.get(PhiR, Part)); 3973 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3974 } 3975 } 3976 } 3977 3978 // If the vector reduction can be performed in a smaller type, we truncate 3979 // then extend the loop exit value to enable InstCombine to evaluate the 3980 // entire expression in the smaller type. 3981 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3982 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3983 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3984 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3985 VectorParts RdxParts(UF); 3986 for (unsigned Part = 0; Part < UF; ++Part) { 3987 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3988 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3989 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 3990 : Builder.CreateZExt(Trunc, VecTy); 3991 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3992 if (U != Trunc) { 3993 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3994 RdxParts[Part] = Extnd; 3995 } 3996 } 3997 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3998 for (unsigned Part = 0; Part < UF; ++Part) { 3999 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4000 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4001 } 4002 } 4003 4004 // Reduce all of the unrolled parts into a single vector. 4005 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4006 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4007 4008 // The middle block terminator has already been assigned a DebugLoc here (the 4009 // OrigLoop's single latch terminator). We want the whole middle block to 4010 // appear to execute on this line because: (a) it is all compiler generated, 4011 // (b) these instructions are always executed after evaluating the latch 4012 // conditional branch, and (c) other passes may add new predecessors which 4013 // terminate on this line. This is the easiest way to ensure we don't 4014 // accidentally cause an extra step back into the loop while debugging. 4015 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4016 if (PhiR->isOrdered()) 4017 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4018 else { 4019 // Floating-point operations should have some FMF to enable the reduction. 4020 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4021 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4022 for (unsigned Part = 1; Part < UF; ++Part) { 4023 Value *RdxPart = State.get(LoopExitInstDef, Part); 4024 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4025 ReducedPartRdx = Builder.CreateBinOp( 4026 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4027 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4028 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4029 ReducedPartRdx, RdxPart); 4030 else 4031 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4032 } 4033 } 4034 4035 // Create the reduction after the loop. Note that inloop reductions create the 4036 // target reduction in the loop using a Reduction recipe. 4037 if (VF.isVector() && !PhiR->isInLoop()) { 4038 ReducedPartRdx = 4039 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4040 // If the reduction can be performed in a smaller type, we need to extend 4041 // the reduction to the wider type before we branch to the original loop. 4042 if (PhiTy != RdxDesc.getRecurrenceType()) 4043 ReducedPartRdx = RdxDesc.isSigned() 4044 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4045 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4046 } 4047 4048 PHINode *ResumePhi = 4049 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4050 4051 // Create a phi node that merges control-flow from the backedge-taken check 4052 // block and the middle block. 4053 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4054 LoopScalarPreHeader->getTerminator()); 4055 4056 // If we are fixing reductions in the epilogue loop then we should already 4057 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4058 // we carry over the incoming values correctly. 
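  // For illustration, once the loop below has added its incoming values, the
  // merge phi typically looks like
  //
  //   %bc.merge.rdx = phi [ %start.value, <bypass blocks> ],
  //                       [ %reduced.rdx, %middle.block ]
  //
  // so the scalar remainder resumes the reduction from the value produced by
  // the vector loop, or from the original start value when the vector loop
  // was bypassed.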
4059   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4060     if (Incoming == LoopMiddleBlock)
4061       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4062     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4063       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4064                               Incoming);
4065     else
4066       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4067   }
4068
4069   // Set the resume value for this reduction
4070   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4071
4072   // If there were stores of the reduction value to a uniform memory address
4073   // inside the loop, create the final store here.
4074   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4075     StoreInst *NewSI =
4076         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4077     propagateMetadata(NewSI, SI);
4078
4079     // If the reduction value is used in other places,
4080     // then let the code below create PHI's for that.
4081   }
4082
4083   // Now, we need to fix the users of the reduction variable
4084   // inside and outside of the scalar remainder loop.
4085
4086   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4087   // in the exit blocks. See comment on analogous loop in
4088   // fixFixedOrderRecurrence for a more complete explanation of the logic.
4089   if (!Cost->requiresScalarEpilogue(VF.isVector()))
4090     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4091       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4092         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4093         State.Plan->removeLiveOut(&LCSSAPhi);
4094       }
4095
4096   // Fix the scalar loop reduction variable with the incoming reduction sum
4097   // from the vector body and from the backedge value.
4098   int IncomingEdgeBlockIdx =
4099       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4100   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4101   // Pick the other block.
4102   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4103   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4104   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4105 }
4106
4107 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4108   // The basic block and loop containing the predicated instruction.
4109   auto *PredBB = PredInst->getParent();
4110   auto *VectorLoop = LI->getLoopFor(PredBB);
4111
4112   // Initialize a worklist with the operands of the predicated instruction.
4113   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4114
4115   // Holds instructions that we need to analyze again. An instruction may be
4116   // reanalyzed if we don't yet know if we can sink it or not.
4117   SmallVector<Instruction *, 8> InstsToReanalyze;
4118
4119   // Returns true if a given use occurs in the predicated block. Phi nodes use
4120   // their operands in their corresponding predecessor blocks.
4121   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4122     auto *I = cast<Instruction>(U.getUser());
4123     BasicBlock *BB = I->getParent();
4124     if (auto *Phi = dyn_cast<PHINode>(I))
4125       BB = Phi->getIncomingBlock(
4126           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4127     return BB == PredBB;
4128   };
4129
4130   // Iteratively sink the scalarized operands of the predicated instruction
4131   // into the block we created for it. When an instruction is sunk, its
4132   // operands are then added to the worklist. The algorithm ends when one pass
4133   // through the worklist doesn't sink a single instruction.
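  // Illustrative sketch of the fixed-point iteration below (hypothetical IR,
  // shown only to clarify the algorithm; the names do not come from a real
  // test):
  //
  //     %gep = getelementptr i32, ptr %base, i64 %idx
  //     ...
  //   pred.block:
  //     %val = load i32, ptr %gep        ; the predicated instruction
  //
  // On the first pass %gep can be sunk into pred.block because its only use is
  // the predicated load; %idx is then re-queued and may be sunk on a later
  // pass once all of its uses reside in pred.block. Instructions whose uses
  // are not yet all in pred.block are parked in InstsToReanalyze and revisited
  // until a full pass makes no change.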
4134   bool Changed;
4135   do {
4136     // Add the instructions that need to be reanalyzed to the worklist, and
4137     // reset the changed indicator.
4138     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4139     InstsToReanalyze.clear();
4140     Changed = false;
4141
4142     while (!Worklist.empty()) {
4143       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4144
4145       // We can't sink an instruction if it is a phi node, is not in the loop,
4146       // may have side effects or may read from memory.
4147       // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
4148       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4149           I->mayHaveSideEffects() || I->mayReadFromMemory())
4150         continue;
4151
4152       // If the instruction is already in PredBB, check if we can sink its
4153       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4154       // sinking the scalar instruction I, hence it appears in PredBB; but it
4155       // may have failed to sink I's operands (recursively), which we try
4156       // (again) here.
4157       if (I->getParent() == PredBB) {
4158         Worklist.insert(I->op_begin(), I->op_end());
4159         continue;
4160       }
4161
4162       // It's legal to sink the instruction if all its uses occur in the
4163       // predicated block. Otherwise, there's nothing to do yet, and we may
4164       // need to reanalyze the instruction.
4165       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4166         InstsToReanalyze.push_back(I);
4167         continue;
4168       }
4169
4170       // Move the instruction to the beginning of the predicated block, and add
4171       // its operands to the worklist.
4172       I->moveBefore(&*PredBB->getFirstInsertionPt());
4173       Worklist.insert(I->op_begin(), I->op_end());
4174
4175       // The sinking may have enabled other instructions to be sunk, so we will
4176       // need to iterate.
4177       Changed = true;
4178     }
4179   } while (Changed);
4180 }
4181
4182 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4183                                               VPTransformState &State) {
4184   auto Iter = vp_depth_first_deep(Plan.getEntry());
4185   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4186     for (VPRecipeBase &P : VPBB->phis()) {
4187       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4188       if (!VPPhi)
4189         continue;
4190       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4191       // Make sure the builder has a valid insert point.
4192       Builder.SetInsertPoint(NewPhi);
4193       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4194         VPValue *Inc = VPPhi->getIncomingValue(i);
4195         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4196         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4197       }
4198     }
4199   }
4200 }
4201
4202 bool InnerLoopVectorizer::useOrderedReductions(
4203     const RecurrenceDescriptor &RdxDesc) {
4204   return Cost->useOrderedReductions(RdxDesc);
4205 }
4206
4207 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4208   // We should not collect Scalars more than once per VF. Right now, this
4209   // function is called from collectUniformsAndScalars(), which already does
4210   // this check. Collecting Scalars for VF=1 does not make any sense.
4211   assert(VF.isVector() && !Scalars.contains(VF) &&
4212          "This function should not be visited twice for the same VF");
4213
4214   // This avoids any chances of creating a REPLICATE recipe during planning
4215   // since that would result in generation of scalarized code during execution,
4216   // which is not supported for scalable vectors.
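  // For example (illustrative): at VF = vscale x 4 the number of lanes is not
  // known at compile time, so replicating an instruction per lane would require
  // emitting a loop over an unknown lane count, which is not supported. For
  // scalable VFs we therefore only record the already-computed uniform values
  // as scalar below.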
4217 if (VF.isScalable()) { 4218 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4219 return; 4220 } 4221 4222 SmallSetVector<Instruction *, 8> Worklist; 4223 4224 // These sets are used to seed the analysis with pointers used by memory 4225 // accesses that will remain scalar. 4226 SmallSetVector<Instruction *, 8> ScalarPtrs; 4227 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4228 auto *Latch = TheLoop->getLoopLatch(); 4229 4230 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4231 // The pointer operands of loads and stores will be scalar as long as the 4232 // memory access is not a gather or scatter operation. The value operand of a 4233 // store will remain scalar if the store is scalarized. 4234 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4235 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4236 assert(WideningDecision != CM_Unknown && 4237 "Widening decision should be ready at this moment"); 4238 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4239 if (Ptr == Store->getValueOperand()) 4240 return WideningDecision == CM_Scalarize; 4241 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4242 "Ptr is neither a value or pointer operand"); 4243 return WideningDecision != CM_GatherScatter; 4244 }; 4245 4246 // A helper that returns true if the given value is a bitcast or 4247 // getelementptr instruction contained in the loop. 4248 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4249 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4250 isa<GetElementPtrInst>(V)) && 4251 !TheLoop->isLoopInvariant(V); 4252 }; 4253 4254 // A helper that evaluates a memory access's use of a pointer. If the use will 4255 // be a scalar use and the pointer is only used by memory accesses, we place 4256 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4257 // PossibleNonScalarPtrs. 4258 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4259 // We only care about bitcast and getelementptr instructions contained in 4260 // the loop. 4261 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4262 return; 4263 4264 // If the pointer has already been identified as scalar (e.g., if it was 4265 // also identified as uniform), there's nothing to do. 4266 auto *I = cast<Instruction>(Ptr); 4267 if (Worklist.count(I)) 4268 return; 4269 4270 // If the use of the pointer will be a scalar use, and all users of the 4271 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4272 // place the pointer in PossibleNonScalarPtrs. 4273 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4274 return isa<LoadInst>(U) || isa<StoreInst>(U); 4275 })) 4276 ScalarPtrs.insert(I); 4277 else 4278 PossibleNonScalarPtrs.insert(I); 4279 }; 4280 4281 // We seed the scalars analysis with three classes of instructions: (1) 4282 // instructions marked uniform-after-vectorization and (2) bitcast, 4283 // getelementptr and (pointer) phi instructions used by memory accesses 4284 // requiring a scalar use. 4285 // 4286 // (1) Add to the worklist all instructions that have been identified as 4287 // uniform-after-vectorization. 4288 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4289 4290 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4291 // memory accesses requiring a scalar use. The pointer operands of loads and 4292 // stores will be scalar as long as the memory accesses is not a gather or 4293 // scatter operation. 
The value operand of a store will remain scalar if the 4294 // store is scalarized. 4295 for (auto *BB : TheLoop->blocks()) 4296 for (auto &I : *BB) { 4297 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4298 evaluatePtrUse(Load, Load->getPointerOperand()); 4299 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4300 evaluatePtrUse(Store, Store->getPointerOperand()); 4301 evaluatePtrUse(Store, Store->getValueOperand()); 4302 } 4303 } 4304 for (auto *I : ScalarPtrs) 4305 if (!PossibleNonScalarPtrs.count(I)) { 4306 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4307 Worklist.insert(I); 4308 } 4309 4310 // Insert the forced scalars. 4311 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4312 // induction variable when the PHI user is scalarized. 4313 auto ForcedScalar = ForcedScalars.find(VF); 4314 if (ForcedScalar != ForcedScalars.end()) 4315 for (auto *I : ForcedScalar->second) { 4316 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 4317 Worklist.insert(I); 4318 } 4319 4320 // Expand the worklist by looking through any bitcasts and getelementptr 4321 // instructions we've already identified as scalar. This is similar to the 4322 // expansion step in collectLoopUniforms(); however, here we're only 4323 // expanding to include additional bitcasts and getelementptr instructions. 4324 unsigned Idx = 0; 4325 while (Idx != Worklist.size()) { 4326 Instruction *Dst = Worklist[Idx++]; 4327 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4328 continue; 4329 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4330 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4331 auto *J = cast<Instruction>(U); 4332 return !TheLoop->contains(J) || Worklist.count(J) || 4333 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4334 isScalarUse(J, Src)); 4335 })) { 4336 Worklist.insert(Src); 4337 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4338 } 4339 } 4340 4341 // An induction variable will remain scalar if all users of the induction 4342 // variable and induction variable update remain scalar. 4343 for (const auto &Induction : Legal->getInductionVars()) { 4344 auto *Ind = Induction.first; 4345 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4346 4347 // If tail-folding is applied, the primary induction variable will be used 4348 // to feed a vector compare. 4349 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4350 continue; 4351 4352 // Returns true if \p Indvar is a pointer induction that is used directly by 4353 // load/store instruction \p I. 4354 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4355 Instruction *I) { 4356 return Induction.second.getKind() == 4357 InductionDescriptor::IK_PtrInduction && 4358 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4359 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4360 }; 4361 4362 // Determine if all users of the induction variable are scalar after 4363 // vectorization. 4364 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4365 auto *I = cast<Instruction>(U); 4366 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4367 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4368 }); 4369 if (!ScalarInd) 4370 continue; 4371 4372 // Determine if all users of the induction variable update instruction are 4373 // scalar after vectorization. 
4374     auto ScalarIndUpdate =
4375         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4376           auto *I = cast<Instruction>(U);
4377           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4378                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4379         });
4380     if (!ScalarIndUpdate)
4381       continue;
4382
4383     // The induction variable and its update instruction will remain scalar.
4384     Worklist.insert(Ind);
4385     Worklist.insert(IndUpdate);
4386     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4387     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4388                       << "\n");
4389   }
4390
4391   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4392 }
4393
4394 bool LoopVectorizationCostModel::isScalarWithPredication(
4395     Instruction *I, ElementCount VF) const {
4396   if (!isPredicatedInst(I))
4397     return false;
4398
4399   // Do we have a non-scalar lowering for this predicated
4400   // instruction? No - it is scalar with predication.
4401   switch(I->getOpcode()) {
4402   default:
4403     return true;
4404   case Instruction::Call:
4405     return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
4406   case Instruction::Load:
4407   case Instruction::Store: {
4408     auto *Ptr = getLoadStorePointerOperand(I);
4409     auto *Ty = getLoadStoreType(I);
4410     Type *VTy = Ty;
4411     if (VF.isVector())
4412       VTy = VectorType::get(Ty, VF);
4413     const Align Alignment = getLoadStoreAlignment(I);
4414     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4415                                 TTI.isLegalMaskedGather(VTy, Alignment))
4416                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4417                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4418   }
4419   case Instruction::UDiv:
4420   case Instruction::SDiv:
4421   case Instruction::SRem:
4422   case Instruction::URem: {
4423     // We have the option to use the safe-divisor idiom to avoid predication.
4424     // The cost based decision here will always select safe-divisor for
4425     // scalable vectors as scalarization isn't legal.
4426     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4427     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4428   }
4429   }
4430 }
4431
4432 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4433   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4434     return false;
4435
4436   // Can we prove this instruction is safe to unconditionally execute?
4437   // If not, we must use some form of predication.
4438   switch(I->getOpcode()) {
4439   default:
4440     return false;
4441   case Instruction::Load:
4442   case Instruction::Store: {
4443     if (!Legal->isMaskRequired(I))
4444       return false;
4445     // When we know the load's address is loop invariant and the instruction
4446     // in the original scalar loop was unconditionally executed then we
4447     // don't need to mark it as a predicated instruction. Tail folding may
4448     // introduce additional predication, but we're guaranteed to always have
4449     // at least one active lane. We call Legal->blockNeedsPredication here
4450     // because it doesn't query tail-folding. For stores, we need to prove
4451     // both speculation safety (which follows from the same argument as for
4452     // loads) and that the value being stored is correct. The easiest form of
4453     // the latter is to require that all values stored are the same.
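    // For example (illustrative IR): under tail folding, a load such as
    //   %v = load i32, ptr @g
    // from a loop-invariant address that executed unconditionally in the
    // original scalar loop can stay unpredicated, because at least one lane is
    // always active. A store to @g is treated the same way, provided the value
    // being stored is loop invariant.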
4454 if (Legal->isInvariant(getLoadStorePointerOperand(I)) && 4455 (isa<LoadInst>(I) || 4456 (isa<StoreInst>(I) && 4457 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 4458 !Legal->blockNeedsPredication(I->getParent())) 4459 return false; 4460 return true; 4461 } 4462 case Instruction::UDiv: 4463 case Instruction::SDiv: 4464 case Instruction::SRem: 4465 case Instruction::URem: 4466 // TODO: We can use the loop-preheader as context point here and get 4467 // context sensitive reasoning 4468 return !isSafeToSpeculativelyExecute(I); 4469 case Instruction::Call: 4470 return Legal->isMaskRequired(I); 4471 } 4472 } 4473 4474 std::pair<InstructionCost, InstructionCost> 4475 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 4476 ElementCount VF) const { 4477 assert(I->getOpcode() == Instruction::UDiv || 4478 I->getOpcode() == Instruction::SDiv || 4479 I->getOpcode() == Instruction::SRem || 4480 I->getOpcode() == Instruction::URem); 4481 assert(!isSafeToSpeculativelyExecute(I)); 4482 4483 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4484 4485 // Scalarization isn't legal for scalable vector types 4486 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 4487 if (!VF.isScalable()) { 4488 // Get the scalarization cost and scale this amount by the probability of 4489 // executing the predicated block. If the instruction is not predicated, 4490 // we fall through to the next case. 4491 ScalarizationCost = 0; 4492 4493 // These instructions have a non-void type, so account for the phi nodes 4494 // that we will create. This cost is likely to be zero. The phi node 4495 // cost, if any, should be scaled by the block probability because it 4496 // models a copy at the end of each predicated block. 4497 ScalarizationCost += VF.getKnownMinValue() * 4498 TTI.getCFInstrCost(Instruction::PHI, CostKind); 4499 4500 // The cost of the non-predicated instruction. 4501 ScalarizationCost += VF.getKnownMinValue() * 4502 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 4503 4504 // The cost of insertelement and extractelement instructions needed for 4505 // scalarization. 4506 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 4507 4508 // Scale the cost by the probability of executing the predicated blocks. 4509 // This assumes the predicated block for each vector lane is equally 4510 // likely. 4511 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 4512 } 4513 InstructionCost SafeDivisorCost = 0; 4514 4515 auto *VecTy = ToVectorTy(I->getType(), VF); 4516 4517 // The cost of the select guard to ensure all lanes are well defined 4518 // after we speculate above any internal control flow. 4519 SafeDivisorCost += TTI.getCmpSelInstrCost( 4520 Instruction::Select, VecTy, 4521 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 4522 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4523 4524 // Certain instructions can be cheaper to vectorize if they have a constant 4525 // second vector operand. One example of this are shifts on x86. 
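  // The divide below is a case in point (illustrative): if the divisor %d in
  //   %q = udiv i32 %x, %d
  // is loop invariant, every lane of the splatted divisor holds the same value,
  // so the operand kind is upgraded to OK_UniformValue below and the target can
  // report a cheaper cost for the widened divide.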
4526 Value *Op2 = I->getOperand(1); 4527 auto Op2Info = TTI.getOperandInfo(Op2); 4528 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 4529 Legal->isInvariant(Op2)) 4530 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 4531 4532 SmallVector<const Value *, 4> Operands(I->operand_values()); 4533 SafeDivisorCost += TTI.getArithmeticInstrCost( 4534 I->getOpcode(), VecTy, CostKind, 4535 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 4536 Op2Info, Operands, I); 4537 return {ScalarizationCost, SafeDivisorCost}; 4538 } 4539 4540 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4541 Instruction *I, ElementCount VF) { 4542 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4543 assert(getWideningDecision(I, VF) == CM_Unknown && 4544 "Decision should not be set yet."); 4545 auto *Group = getInterleavedAccessGroup(I); 4546 assert(Group && "Must have a group."); 4547 4548 // If the instruction's allocated size doesn't equal it's type size, it 4549 // requires padding and will be scalarized. 4550 auto &DL = I->getModule()->getDataLayout(); 4551 auto *ScalarTy = getLoadStoreType(I); 4552 if (hasIrregularType(ScalarTy, DL)) 4553 return false; 4554 4555 // If the group involves a non-integral pointer, we may not be able to 4556 // losslessly cast all values to a common type. 4557 unsigned InterleaveFactor = Group->getFactor(); 4558 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4559 for (unsigned i = 0; i < InterleaveFactor; i++) { 4560 Instruction *Member = Group->getMember(i); 4561 if (!Member) 4562 continue; 4563 auto *MemberTy = getLoadStoreType(Member); 4564 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4565 // Don't coerce non-integral pointers to integers or vice versa. 4566 if (MemberNI != ScalarNI) { 4567 // TODO: Consider adding special nullptr value case here 4568 return false; 4569 } else if (MemberNI && ScalarNI && 4570 ScalarTy->getPointerAddressSpace() != 4571 MemberTy->getPointerAddressSpace()) { 4572 return false; 4573 } 4574 } 4575 4576 // Check if masking is required. 4577 // A Group may need masking for one of two reasons: it resides in a block that 4578 // needs predication, or it was decided to use masking to deal with gaps 4579 // (either a gap at the end of a load-access that may result in a speculative 4580 // load, or any gaps in a store-access). 4581 bool PredicatedAccessRequiresMasking = 4582 blockNeedsPredicationForAnyReason(I->getParent()) && 4583 Legal->isMaskRequired(I); 4584 bool LoadAccessWithGapsRequiresEpilogMasking = 4585 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4586 !isScalarEpilogueAllowed(); 4587 bool StoreAccessWithGapsRequiresMasking = 4588 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4589 if (!PredicatedAccessRequiresMasking && 4590 !LoadAccessWithGapsRequiresEpilogMasking && 4591 !StoreAccessWithGapsRequiresMasking) 4592 return true; 4593 4594 // If masked interleaving is required, we expect that the user/target had 4595 // enabled it, because otherwise it either wouldn't have been created or 4596 // it should have been invalidated by the CostModel. 4597 assert(useMaskedInterleavedAccesses(TTI) && 4598 "Masked interleave-groups for predicated accesses are not enabled."); 4599 4600 if (Group->isReverse()) 4601 return false; 4602 4603 auto *Ty = getLoadStoreType(I); 4604 const Align Alignment = getLoadStoreAlignment(I); 4605 return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment) 4606 : TTI.isLegalMaskedStore(Ty, Alignment); 4607 } 4608 4609 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4610 Instruction *I, ElementCount VF) { 4611 // Get and ensure we have a valid memory instruction. 4612 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4613 4614 auto *Ptr = getLoadStorePointerOperand(I); 4615 auto *ScalarTy = getLoadStoreType(I); 4616 4617 // In order to be widened, the pointer should be consecutive, first of all. 4618 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4619 return false; 4620 4621 // If the instruction is a store located in a predicated block, it will be 4622 // scalarized. 4623 if (isScalarWithPredication(I, VF)) 4624 return false; 4625 4626 // If the instruction's allocated size doesn't equal it's type size, it 4627 // requires padding and will be scalarized. 4628 auto &DL = I->getModule()->getDataLayout(); 4629 if (hasIrregularType(ScalarTy, DL)) 4630 return false; 4631 4632 return true; 4633 } 4634 4635 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4636 // We should not collect Uniforms more than once per VF. Right now, 4637 // this function is called from collectUniformsAndScalars(), which 4638 // already does this check. Collecting Uniforms for VF=1 does not make any 4639 // sense. 4640 4641 assert(VF.isVector() && !Uniforms.contains(VF) && 4642 "This function should not be visited twice for the same VF"); 4643 4644 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4645 // not analyze again. Uniforms.count(VF) will return 1. 4646 Uniforms[VF].clear(); 4647 4648 // We now know that the loop is vectorizable! 4649 // Collect instructions inside the loop that will remain uniform after 4650 // vectorization. 4651 4652 // Global values, params and instructions outside of current loop are out of 4653 // scope. 4654 auto isOutOfScope = [&](Value *V) -> bool { 4655 Instruction *I = dyn_cast<Instruction>(V); 4656 return (!I || !TheLoop->contains(I)); 4657 }; 4658 4659 // Worklist containing uniform instructions demanding lane 0. 4660 SetVector<Instruction *> Worklist; 4661 BasicBlock *Latch = TheLoop->getLoopLatch(); 4662 4663 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4664 // that are scalar with predication must not be considered uniform after 4665 // vectorization, because that would create an erroneous replicating region 4666 // where only a single instance out of VF should be formed. 4667 // TODO: optimize such seldom cases if found important, see PR40816. 4668 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4669 if (isOutOfScope(I)) { 4670 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4671 << *I << "\n"); 4672 return; 4673 } 4674 if (isScalarWithPredication(I, VF)) { 4675 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4676 << *I << "\n"); 4677 return; 4678 } 4679 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4680 Worklist.insert(I); 4681 }; 4682 4683 // Start with the conditional branch. If the branch condition is an 4684 // instruction contained in the loop that is only used by the branch, it is 4685 // uniform. 
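  // For example (illustrative):
  //   %ec = icmp eq i64 %iv.next, %n
  //   br i1 %ec, label %exit, label %loop
  // The compare produces a single branch condition per vector iteration
  // regardless of VF, so one scalar copy suffices and it is added to the
  // worklist below.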
4686 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4687 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4688 addToWorklistIfAllowed(Cmp); 4689 4690 auto PrevVF = VF.divideCoefficientBy(2); 4691 // Return true if all lanes perform the same memory operation, and we can 4692 // thus chose to execute only one. 4693 auto isUniformMemOpUse = [&](Instruction *I) { 4694 // If the value was already known to not be uniform for the previous 4695 // (smaller VF), it cannot be uniform for the larger VF. 4696 if (PrevVF.isVector()) { 4697 auto Iter = Uniforms.find(PrevVF); 4698 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 4699 return false; 4700 } 4701 if (!Legal->isUniformMemOp(*I, VF)) 4702 return false; 4703 if (isa<LoadInst>(I)) 4704 // Loading the same address always produces the same result - at least 4705 // assuming aliasing and ordering which have already been checked. 4706 return true; 4707 // Storing the same value on every iteration. 4708 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 4709 }; 4710 4711 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4712 InstWidening WideningDecision = getWideningDecision(I, VF); 4713 assert(WideningDecision != CM_Unknown && 4714 "Widening decision should be ready at this moment"); 4715 4716 if (isUniformMemOpUse(I)) 4717 return true; 4718 4719 return (WideningDecision == CM_Widen || 4720 WideningDecision == CM_Widen_Reverse || 4721 WideningDecision == CM_Interleave); 4722 }; 4723 4724 // Returns true if Ptr is the pointer operand of a memory access instruction 4725 // I, I is known to not require scalarization, and the pointer is not also 4726 // stored. 4727 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4728 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 4729 return false; 4730 return getLoadStorePointerOperand(I) == Ptr && 4731 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 4732 }; 4733 4734 // Holds a list of values which are known to have at least one uniform use. 4735 // Note that there may be other uses which aren't uniform. A "uniform use" 4736 // here is something which only demands lane 0 of the unrolled iterations; 4737 // it does not imply that all lanes produce the same value (e.g. this is not 4738 // the usual meaning of uniform) 4739 SetVector<Value *> HasUniformUse; 4740 4741 // Scan the loop for instructions which are either a) known to have only 4742 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4743 for (auto *BB : TheLoop->blocks()) 4744 for (auto &I : *BB) { 4745 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4746 switch (II->getIntrinsicID()) { 4747 case Intrinsic::sideeffect: 4748 case Intrinsic::experimental_noalias_scope_decl: 4749 case Intrinsic::assume: 4750 case Intrinsic::lifetime_start: 4751 case Intrinsic::lifetime_end: 4752 if (TheLoop->hasLoopInvariantOperands(&I)) 4753 addToWorklistIfAllowed(&I); 4754 break; 4755 default: 4756 break; 4757 } 4758 } 4759 4760 // ExtractValue instructions must be uniform, because the operands are 4761 // known to be loop-invariant. 4762 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4763 assert(isOutOfScope(EVI->getAggregateOperand()) && 4764 "Expected aggregate value to be loop invariant"); 4765 addToWorklistIfAllowed(EVI); 4766 continue; 4767 } 4768 4769 // If there's no pointer operand, there's nothing to do. 
4770 auto *Ptr = getLoadStorePointerOperand(&I); 4771 if (!Ptr) 4772 continue; 4773 4774 if (isUniformMemOpUse(&I)) 4775 addToWorklistIfAllowed(&I); 4776 4777 if (isVectorizedMemAccessUse(&I, Ptr)) 4778 HasUniformUse.insert(Ptr); 4779 } 4780 4781 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4782 // demanding) users. Since loops are assumed to be in LCSSA form, this 4783 // disallows uses outside the loop as well. 4784 for (auto *V : HasUniformUse) { 4785 if (isOutOfScope(V)) 4786 continue; 4787 auto *I = cast<Instruction>(V); 4788 auto UsersAreMemAccesses = 4789 llvm::all_of(I->users(), [&](User *U) -> bool { 4790 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4791 }); 4792 if (UsersAreMemAccesses) 4793 addToWorklistIfAllowed(I); 4794 } 4795 4796 // Expand Worklist in topological order: whenever a new instruction 4797 // is added , its users should be already inside Worklist. It ensures 4798 // a uniform instruction will only be used by uniform instructions. 4799 unsigned idx = 0; 4800 while (idx != Worklist.size()) { 4801 Instruction *I = Worklist[idx++]; 4802 4803 for (auto *OV : I->operand_values()) { 4804 // isOutOfScope operands cannot be uniform instructions. 4805 if (isOutOfScope(OV)) 4806 continue; 4807 // First order recurrence Phi's should typically be considered 4808 // non-uniform. 4809 auto *OP = dyn_cast<PHINode>(OV); 4810 if (OP && Legal->isFixedOrderRecurrence(OP)) 4811 continue; 4812 // If all the users of the operand are uniform, then add the 4813 // operand into the uniform worklist. 4814 auto *OI = cast<Instruction>(OV); 4815 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4816 auto *J = cast<Instruction>(U); 4817 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4818 })) 4819 addToWorklistIfAllowed(OI); 4820 } 4821 } 4822 4823 // For an instruction to be added into Worklist above, all its users inside 4824 // the loop should also be in Worklist. However, this condition cannot be 4825 // true for phi nodes that form a cyclic dependence. We must process phi 4826 // nodes separately. An induction variable will remain uniform if all users 4827 // of the induction variable and induction variable update remain uniform. 4828 // The code below handles both pointer and non-pointer induction variables. 4829 for (const auto &Induction : Legal->getInductionVars()) { 4830 auto *Ind = Induction.first; 4831 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4832 4833 // Determine if all users of the induction variable are uniform after 4834 // vectorization. 4835 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4836 auto *I = cast<Instruction>(U); 4837 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4838 isVectorizedMemAccessUse(I, Ind); 4839 }); 4840 if (!UniformInd) 4841 continue; 4842 4843 // Determine if all users of the induction variable update instruction are 4844 // uniform after vectorization. 4845 auto UniformIndUpdate = 4846 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4847 auto *I = cast<Instruction>(U); 4848 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4849 isVectorizedMemAccessUse(I, IndUpdate); 4850 }); 4851 if (!UniformIndUpdate) 4852 continue; 4853 4854 // The induction variable and its update instruction will remain uniform. 
4855 addToWorklistIfAllowed(Ind); 4856 addToWorklistIfAllowed(IndUpdate); 4857 } 4858 4859 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4860 } 4861 4862 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4863 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4864 4865 if (Legal->getRuntimePointerChecking()->Need) { 4866 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4867 "runtime pointer checks needed. Enable vectorization of this " 4868 "loop with '#pragma clang loop vectorize(enable)' when " 4869 "compiling with -Os/-Oz", 4870 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4871 return true; 4872 } 4873 4874 if (!PSE.getPredicate().isAlwaysTrue()) { 4875 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4876 "runtime SCEV checks needed. Enable vectorization of this " 4877 "loop with '#pragma clang loop vectorize(enable)' when " 4878 "compiling with -Os/-Oz", 4879 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4880 return true; 4881 } 4882 4883 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4884 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4885 reportVectorizationFailure("Runtime stride check for small trip count", 4886 "runtime stride == 1 checks needed. Enable vectorization of " 4887 "this loop without such check by compiling with -Os/-Oz", 4888 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4889 return true; 4890 } 4891 4892 return false; 4893 } 4894 4895 ElementCount 4896 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4897 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4898 return ElementCount::getScalable(0); 4899 4900 if (Hints->isScalableVectorizationDisabled()) { 4901 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4902 "ScalableVectorizationDisabled", ORE, TheLoop); 4903 return ElementCount::getScalable(0); 4904 } 4905 4906 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4907 4908 auto MaxScalableVF = ElementCount::getScalable( 4909 std::numeric_limits<ElementCount::ScalarTy>::max()); 4910 4911 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4912 // FIXME: While for scalable vectors this is currently sufficient, this should 4913 // be replaced by a more detailed mechanism that filters out specific VFs, 4914 // instead of invalidating vectorization for a whole set of VFs based on the 4915 // MaxVF. 4916 4917 // Disable scalable vectorization if the loop contains unsupported reductions. 4918 if (!canVectorizeReductions(MaxScalableVF)) { 4919 reportVectorizationInfo( 4920 "Scalable vectorization not supported for the reduction " 4921 "operations found in this loop.", 4922 "ScalableVFUnfeasible", ORE, TheLoop); 4923 return ElementCount::getScalable(0); 4924 } 4925 4926 // Disable scalable vectorization if the loop contains any instructions 4927 // with element types not supported for scalable vectors. 4928 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4929 return !Ty->isVoidTy() && 4930 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4931 })) { 4932 reportVectorizationInfo("Scalable vectorization is not supported " 4933 "for all element types found in this loop.", 4934 "ScalableVFUnfeasible", ORE, TheLoop); 4935 return ElementCount::getScalable(0); 4936 } 4937 4938 if (Legal->isSafeForAnyVectorWidth()) 4939 return MaxScalableVF; 4940 4941 // Limit MaxScalableVF by the maximum safe dependence distance. 
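  // Worked example with made-up numbers: if MaxSafeElements is 32 and the
  // maximum vscale for the target is 16, the clamping below yields
  // ElementCount::getScalable(32 / 16) = vscale x 2, so even at the largest
  // possible vscale at most 32 lanes are used and the safe dependence distance
  // is respected.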
4942 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) 4943 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 4944 else 4945 MaxScalableVF = ElementCount::getScalable(0); 4946 4947 if (!MaxScalableVF) 4948 reportVectorizationInfo( 4949 "Max legal vector width too small, scalable vectorization " 4950 "unfeasible.", 4951 "ScalableVFUnfeasible", ORE, TheLoop); 4952 4953 return MaxScalableVF; 4954 } 4955 4956 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4957 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4958 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4959 unsigned SmallestType, WidestType; 4960 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4961 4962 // Get the maximum safe dependence distance in bits computed by LAA. 4963 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4964 // the memory accesses that is most restrictive (involved in the smallest 4965 // dependence distance). 4966 unsigned MaxSafeElements = 4967 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4968 4969 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4970 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4971 4972 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4973 << ".\n"); 4974 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4975 << ".\n"); 4976 4977 // First analyze the UserVF, fall back if the UserVF should be ignored. 4978 if (UserVF) { 4979 auto MaxSafeUserVF = 4980 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4981 4982 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4983 // If `VF=vscale x N` is safe, then so is `VF=N` 4984 if (UserVF.isScalable()) 4985 return FixedScalableVFPair( 4986 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4987 else 4988 return UserVF; 4989 } 4990 4991 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4992 4993 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4994 // is better to ignore the hint and let the compiler choose a suitable VF. 4995 if (!UserVF.isScalable()) { 4996 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4997 << " is unsafe, clamping to max safe VF=" 4998 << MaxSafeFixedVF << ".\n"); 4999 ORE->emit([&]() { 5000 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5001 TheLoop->getStartLoc(), 5002 TheLoop->getHeader()) 5003 << "User-specified vectorization factor " 5004 << ore::NV("UserVectorizationFactor", UserVF) 5005 << " is unsafe, clamping to maximum safe vectorization factor " 5006 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5007 }); 5008 return MaxSafeFixedVF; 5009 } 5010 5011 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5012 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5013 << " is ignored because scalable vectors are not " 5014 "available.\n"); 5015 ORE->emit([&]() { 5016 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5017 TheLoop->getStartLoc(), 5018 TheLoop->getHeader()) 5019 << "User-specified vectorization factor " 5020 << ore::NV("UserVectorizationFactor", UserVF) 5021 << " is ignored because the target does not support scalable " 5022 "vectors. The compiler will pick a more suitable value."; 5023 }); 5024 } else { 5025 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5026 << " is unsafe. 
Ignoring scalable UserVF.\n");
5027       ORE->emit([&]() {
5028         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5029                                           TheLoop->getStartLoc(),
5030                                           TheLoop->getHeader())
5031                << "User-specified vectorization factor "
5032                << ore::NV("UserVectorizationFactor", UserVF)
5033                << " is unsafe. Ignoring the hint to let the compiler pick a "
5034                   "more suitable value.";
5035       });
5036     }
5037   }
5038
5039   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5040                     << " / " << WidestType << " bits.\n");
5041
5042   FixedScalableVFPair Result(ElementCount::getFixed(1),
5043                              ElementCount::getScalable(0));
5044   if (auto MaxVF =
5045           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5046                                   MaxSafeFixedVF, FoldTailByMasking))
5047     Result.FixedVF = MaxVF;
5048
5049   if (auto MaxVF =
5050           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5051                                   MaxSafeScalableVF, FoldTailByMasking))
5052     if (MaxVF.isScalable()) {
5053       Result.ScalableVF = MaxVF;
5054       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5055                         << "\n");
5056     }
5057
5058   return Result;
5059 }
5060
5061 FixedScalableVFPair
5062 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5063   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5064     // TODO: It may be useful to do this, since it's still likely to be
5065     // dynamically uniform if the target can skip.
5066     reportVectorizationFailure(
5067         "Not inserting runtime ptr check for divergent target",
5068         "runtime pointer checks needed. Not enabled for divergent target",
5069         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5070     return FixedScalableVFPair::getNone();
5071   }
5072
5073   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5074   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5075   if (TC == 1) {
5076     reportVectorizationFailure("Single iteration (non) loop",
5077         "loop trip count is one, irrelevant for vectorization",
5078         "SingleIterationLoop", ORE, TheLoop);
5079     return FixedScalableVFPair::getNone();
5080   }
5081
5082   switch (ScalarEpilogueStatus) {
5083   case CM_ScalarEpilogueAllowed:
5084     return computeFeasibleMaxVF(TC, UserVF, false);
5085   case CM_ScalarEpilogueNotAllowedUsePredicate:
5086     [[fallthrough]];
5087   case CM_ScalarEpilogueNotNeededUsePredicate:
5088     LLVM_DEBUG(
5089         dbgs() << "LV: vector predicate hint/switch found.\n"
5090                << "LV: Not allowing scalar epilogue, creating predicated "
5091                << "vector loop.\n");
5092     break;
5093   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5094     // fallthrough as a special case of OptForSize
5095   case CM_ScalarEpilogueNotAllowedOptSize:
5096     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5097       LLVM_DEBUG(
5098           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5099     else
5100       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5101                         << "count.\n");
5102
5103     // Bail if runtime checks are required, which are not good when optimising
5104     // for size.
5105     if (runtimeChecksRequired())
5106       return FixedScalableVFPair::getNone();
5107
5108     break;
5109   }
5110
5111   // The only loops we can vectorize without a scalar epilogue are loops with
5112   // a bottom-test and a single exiting block. We'd have to handle the fact
5113   // that not every instruction executes on the last iteration. This will
5114   // require a lane mask which varies through the vector loop body. (TODO)
5115   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5116     // If there was a tail-folding hint/switch, but we can't fold the tail by
5117     // masking, fallback to a vectorization with a scalar epilogue.
5118     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5119       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5120                            "scalar epilogue instead.\n");
5121       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5122       return computeFeasibleMaxVF(TC, UserVF, false);
5123     }
5124     return FixedScalableVFPair::getNone();
5125   }
5126
5127   // Now try the tail folding.
5128
5129   // Invalidate interleave groups that require an epilogue if we can't mask
5130   // the interleave-group.
5131   if (!useMaskedInterleavedAccesses(TTI)) {
5132     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5133            "No decisions should have been taken at this point");
5134     // Note: There is no need to invalidate any cost modeling decisions here, as
5135     // none were taken so far.
5136     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5137   }
5138
5139   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5140
5141   // Avoid tail folding if the trip count is known to be a multiple of any VF
5142   // we choose.
5143   std::optional<unsigned> MaxPowerOf2RuntimeVF =
5144       MaxFactors.FixedVF.getFixedValue();
5145   if (MaxFactors.ScalableVF) {
5146     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
5147     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
5148       MaxPowerOf2RuntimeVF = std::max<unsigned>(
5149           *MaxPowerOf2RuntimeVF,
5150           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
5151     } else
5152       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
5153   }
5154
5155   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
5156     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
5157            "MaxFixedVF must be a power of 2");
5158     unsigned MaxVFtimesIC =
5159         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
5160     ScalarEvolution *SE = PSE.getSE();
5161     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5162     const SCEV *ExitCount = SE->getAddExpr(
5163         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5164     const SCEV *Rem = SE->getURemExpr(
5165         SE->applyLoopGuards(ExitCount, TheLoop),
5166         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5167     if (Rem->isZero()) {
5168       // Accept MaxFixedVF if we do not have a tail.
5169       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5170       return MaxFactors;
5171     }
5172   }
5173
5174   // If we don't know the precise trip count, or if the trip count that we
5175   // found modulo the vectorization factor is not zero, try to fold the tail
5176   // by masking.
5177   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5178   if (Legal->prepareToFoldTailByMasking()) {
5179     CanFoldTailByMasking = true;
5180     return MaxFactors;
5181   }
5182
5183   // If there was a tail-folding hint/switch, but we can't fold the tail by
5184   // masking, fallback to a vectorization with a scalar epilogue.
5185 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5186 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5187 "scalar epilogue instead.\n"); 5188 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5189 return MaxFactors; 5190 } 5191 5192 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5193 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5194 return FixedScalableVFPair::getNone(); 5195 } 5196 5197 if (TC == 0) { 5198 reportVectorizationFailure( 5199 "Unable to calculate the loop count due to complex control flow", 5200 "unable to calculate the loop count due to complex control flow", 5201 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5202 return FixedScalableVFPair::getNone(); 5203 } 5204 5205 reportVectorizationFailure( 5206 "Cannot optimize for size and vectorize at the same time.", 5207 "cannot optimize for size and vectorize at the same time. " 5208 "Enable vectorization of this loop with '#pragma clang loop " 5209 "vectorize(enable)' when compiling with -Os/-Oz", 5210 "NoTailLoopWithOptForSize", ORE, TheLoop); 5211 return FixedScalableVFPair::getNone(); 5212 } 5213 5214 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5215 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5216 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5217 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5218 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 5219 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5220 : TargetTransformInfo::RGK_FixedWidthVector); 5221 5222 // Convenience function to return the minimum of two ElementCounts. 5223 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5224 assert((LHS.isScalable() == RHS.isScalable()) && 5225 "Scalable flags must match"); 5226 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5227 }; 5228 5229 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5230 // Note that both WidestRegister and WidestType may not be a powers of 2. 5231 auto MaxVectorElementCount = ElementCount::get( 5232 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 5233 ComputeScalableMaxVF); 5234 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5235 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5236 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5237 5238 if (!MaxVectorElementCount) { 5239 LLVM_DEBUG(dbgs() << "LV: The target has no " 5240 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5241 << " vector registers.\n"); 5242 return ElementCount::getFixed(1); 5243 } 5244 5245 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 5246 if (MaxVectorElementCount.isScalable() && 5247 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5248 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5249 auto Min = Attr.getVScaleRangeMin(); 5250 WidestRegisterMinEC *= Min; 5251 } 5252 5253 // When a scalar epilogue is required, at least one iteration of the scalar 5254 // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a 5255 // max VF that results in a dead vector loop. 
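  // For example (hypothetical numbers): with a constant trip count of 8 and a
  // required scalar epilogue, ConstTripCount is treated as 7 below, so VF=8 is
  // not chosen and the vector loop is guaranteed to run at least one iteration
  // before the epilogue executes.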
5256   if (ConstTripCount > 0 && requiresScalarEpilogue(true))
5257     ConstTripCount -= 1;
5258
5259   if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5260       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5261     // If loop trip count (TC) is known at compile time there is no point in
5262     // choosing VF greater than TC (as done in the loop below). Select maximum
5263     // power of two which doesn't exceed TC.
5264     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5265     // when the TC is less than or equal to the known number of lanes.
5266     auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount);
5267     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5268                          "exceeding the constant trip count: "
5269                       << ClampedConstTripCount << "\n");
5270     return ElementCount::getFixed(ClampedConstTripCount);
5271   }
5272
5273   TargetTransformInfo::RegisterKind RegKind =
5274       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5275                            : TargetTransformInfo::RGK_FixedWidthVector;
5276   ElementCount MaxVF = MaxVectorElementCount;
5277   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5278                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5279     auto MaxVectorElementCountMaxBW = ElementCount::get(
5280         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
5281         ComputeScalableMaxVF);
5282     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5283
5284     // Collect all viable vectorization factors larger than the default MaxVF
5285     // (i.e. MaxVectorElementCount).
5286     SmallVector<ElementCount, 8> VFs;
5287     for (ElementCount VS = MaxVectorElementCount * 2;
5288          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5289       VFs.push_back(VS);
5290
5291     // For each VF calculate its register usage.
5292     auto RUs = calculateRegisterUsage(VFs);
5293
5294     // Select the largest VF which doesn't require more registers than existing
5295     // ones.
5296     for (int i = RUs.size() - 1; i >= 0; --i) {
5297       bool Selected = true;
5298       for (auto &pair : RUs[i].MaxLocalUsers) {
5299         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5300         if (pair.second > TargetNumRegisters)
5301           Selected = false;
5302       }
5303       if (Selected) {
5304         MaxVF = VFs[i];
5305         break;
5306       }
5307     }
5308     if (ElementCount MinVF =
5309             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5310       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5311         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5312                           << ") with target's minimum: " << MinVF << '\n');
5313         MaxVF = MinVF;
5314       }
5315     }
5316
5317     // Invalidate any widening decisions we might have made, in case the loop
5318     // requires predication (decided later), but we have already made some
5319     // load/store widening decisions.
5320     invalidateCostModelingDecisions();
5321   }
5322   return MaxVF;
5323 }
5324
5325 /// Convenience function that returns the value of vscale_range iff
5326 /// vscale_range.min == vscale_range.max, otherwise returns the value
5327 /// returned by the corresponding TTI method.
5328 static std::optional<unsigned> 5329 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 5330 const Function *Fn = L->getHeader()->getParent(); 5331 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 5332 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 5333 auto Min = Attr.getVScaleRangeMin(); 5334 auto Max = Attr.getVScaleRangeMax(); 5335 if (Max && Min == Max) 5336 return Max; 5337 } 5338 5339 return TTI.getVScaleForTuning(); 5340 } 5341 5342 bool LoopVectorizationPlanner::isMoreProfitable( 5343 const VectorizationFactor &A, const VectorizationFactor &B) const { 5344 InstructionCost CostA = A.Cost; 5345 InstructionCost CostB = B.Cost; 5346 5347 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); 5348 5349 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { 5350 // If the trip count is a known (possibly small) constant, the trip count 5351 // will be rounded up to an integer number of iterations under 5352 // FoldTailByMasking. The total cost in that case will be 5353 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 5354 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 5355 // some extra overheads, but for the purpose of comparing the costs of 5356 // different VFs we can use this to compare the total loop-body cost 5357 // expected after vectorization. 5358 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 5359 InstructionCost VectorCost, 5360 InstructionCost ScalarCost) { 5361 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) 5362 : VectorCost * (MaxTripCount / VF) + 5363 ScalarCost * (MaxTripCount % VF); 5364 }; 5365 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); 5366 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); 5367 5368 return RTCostA < RTCostB; 5369 } 5370 5371 // Improve estimate for the vector width if it is scalable. 5372 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5373 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5374 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 5375 if (A.Width.isScalable()) 5376 EstimatedWidthA *= *VScale; 5377 if (B.Width.isScalable()) 5378 EstimatedWidthB *= *VScale; 5379 } 5380 5381 // Assume vscale may be larger than 1 (or the value being tuned for), 5382 // so that scalable vectorization is slightly favorable over fixed-width 5383 // vectorization. 5384 if (A.Width.isScalable() && !B.Width.isScalable()) 5385 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5386 5387 // To avoid the need for FP division: 5388 // (CostA / A.Width) < (CostB / B.Width) 5389 // <=> (CostA * B.Width) < (CostB * A.Width) 5390 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5391 } 5392 5393 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, 5394 OptimizationRemarkEmitter *ORE, 5395 Loop *TheLoop) { 5396 if (InvalidCosts.empty()) 5397 return; 5398 5399 // Emit a report of VFs with invalid costs in the loop. 5400 5401 // Group the remarks per instruction, keeping the instruction order from 5402 // InvalidCosts. 5403 std::map<Instruction *, unsigned> Numbering; 5404 unsigned I = 0; 5405 for (auto &Pair : InvalidCosts) 5406 if (!Numbering.count(Pair.first)) 5407 Numbering[Pair.first] = I++; 5408 5409 // Sort the list, first on instruction(number) then on VF. 
5410 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5411 if (Numbering[A.first] != Numbering[B.first]) 5412 return Numbering[A.first] < Numbering[B.first]; 5413 ElementCountComparator ECC; 5414 return ECC(A.second, B.second); 5415 }); 5416 5417 // For a list of ordered instruction-vf pairs: 5418 // [(load, vf1), (load, vf2), (store, vf1)] 5419 // Group the instructions together to emit separate remarks for: 5420 // load (vf1, vf2) 5421 // store (vf1) 5422 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5423 auto Subset = ArrayRef<InstructionVFPair>(); 5424 do { 5425 if (Subset.empty()) 5426 Subset = Tail.take_front(1); 5427 5428 Instruction *I = Subset.front().first; 5429 5430 // If the next instruction is different, or if there are no other pairs, 5431 // emit a remark for the collated subset. e.g. 5432 // [(load, vf1), (load, vf2))] 5433 // to emit: 5434 // remark: invalid costs for 'load' at VF=(vf, vf2) 5435 if (Subset == Tail || Tail[Subset.size()].first != I) { 5436 std::string OutString; 5437 raw_string_ostream OS(OutString); 5438 assert(!Subset.empty() && "Unexpected empty range"); 5439 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5440 for (const auto &Pair : Subset) 5441 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 5442 OS << "):"; 5443 if (auto *CI = dyn_cast<CallInst>(I)) 5444 OS << " call to " << CI->getCalledFunction()->getName(); 5445 else 5446 OS << " " << I->getOpcodeName(); 5447 OS.flush(); 5448 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5449 Tail = Tail.drop_front(Subset.size()); 5450 Subset = {}; 5451 } else 5452 // Grow the subset by one element 5453 Subset = Tail.take_front(Subset.size() + 1); 5454 } while (!Tail.empty()); 5455 } 5456 5457 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( 5458 const ElementCountSet &VFCandidates) { 5459 InstructionCost ExpectedCost = 5460 CM.expectedCost(ElementCount::getFixed(1)).first; 5461 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5462 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5463 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5464 "Expected Scalar VF to be a candidate"); 5465 5466 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5467 ExpectedCost); 5468 VectorizationFactor ChosenFactor = ScalarCost; 5469 5470 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 5471 if (ForceVectorization && VFCandidates.size() > 1) { 5472 // Ignore scalar width, because the user explicitly wants vectorization. 5473 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5474 // evaluation. 5475 ChosenFactor.Cost = InstructionCost::getMax(); 5476 } 5477 5478 SmallVector<InstructionVFPair> InvalidCosts; 5479 for (const auto &i : VFCandidates) { 5480 // The cost for scalar VF=1 is already calculated, so ignore it. 5481 if (i.isScalar()) 5482 continue; 5483 5484 LoopVectorizationCostModel::VectorizationCostTy C = 5485 CM.expectedCost(i, &InvalidCosts); 5486 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5487 5488 #ifndef NDEBUG 5489 unsigned AssumedMinimumVscale = 1; 5490 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5491 AssumedMinimumVscale = *VScale; 5492 unsigned Width = 5493 Candidate.Width.isScalable() 5494 ? 
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5495 : Candidate.Width.getFixedValue(); 5496 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5497 << " costs: " << (Candidate.Cost / Width)); 5498 if (i.isScalable()) 5499 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5500 << AssumedMinimumVscale << ")"); 5501 LLVM_DEBUG(dbgs() << ".\n"); 5502 #endif 5503 5504 if (!C.second && !ForceVectorization) { 5505 LLVM_DEBUG( 5506 dbgs() << "LV: Not considering vector loop of width " << i 5507 << " because it will not generate any vector instructions.\n"); 5508 continue; 5509 } 5510 5511 // If profitable add it to ProfitableVF list. 5512 if (isMoreProfitable(Candidate, ScalarCost)) 5513 ProfitableVFs.push_back(Candidate); 5514 5515 if (isMoreProfitable(Candidate, ChosenFactor)) 5516 ChosenFactor = Candidate; 5517 } 5518 5519 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); 5520 5521 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 5522 reportVectorizationFailure( 5523 "There are conditional stores.", 5524 "store that is conditionally executed prevents vectorization", 5525 "ConditionalStore", ORE, OrigLoop); 5526 ChosenFactor = ScalarCost; 5527 } 5528 5529 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5530 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 5531 << "LV: Vectorization seems to be not beneficial, " 5532 << "but was forced by a user.\n"); 5533 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5534 return ChosenFactor; 5535 } 5536 5537 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 5538 ElementCount VF) const { 5539 // Cross iteration phis such as reductions need special handling and are 5540 // currently unsupported. 5541 if (any_of(OrigLoop->getHeader()->phis(), 5542 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 5543 return false; 5544 5545 // Phis with uses outside of the loop require special handling and are 5546 // currently unsupported. 5547 for (const auto &Entry : Legal->getInductionVars()) { 5548 // Look for uses of the value of the induction at the last iteration. 5549 Value *PostInc = 5550 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 5551 for (User *U : PostInc->users()) 5552 if (!OrigLoop->contains(cast<Instruction>(U))) 5553 return false; 5554 // Look for uses of penultimate value of the induction. 5555 for (User *U : Entry.first->users()) 5556 if (!OrigLoop->contains(cast<Instruction>(U))) 5557 return false; 5558 } 5559 5560 // Epilogue vectorization code has not been auditted to ensure it handles 5561 // non-latch exits properly. It may be fine, but it needs auditted and 5562 // tested. 5563 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) 5564 return false; 5565 5566 return true; 5567 } 5568 5569 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5570 const ElementCount VF) const { 5571 // FIXME: We need a much better cost-model to take different parameters such 5572 // as register pressure, code size increase and cost of extra branches into 5573 // account. For now we apply a very crude heuristic and only consider loops 5574 // with vectorization factors larger than a certain value. 5575 5576 // Allow the target to opt out entirely. 5577 if (!TTI.preferEpilogueVectorization()) 5578 return false; 5579 5580 // We also consider epilogue vectorization unprofitable for targets that don't 5581 // consider interleaving beneficial (eg. MVE). 
5582 if (TTI.getMaxInterleaveFactor(VF) <= 1) 5583 return false; 5584 5585 unsigned Multiplier = 1; 5586 if (VF.isScalable()) 5587 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); 5588 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) 5589 return true; 5590 return false; 5591 } 5592 5593 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 5594 const ElementCount MainLoopVF, unsigned IC) { 5595 VectorizationFactor Result = VectorizationFactor::Disabled(); 5596 if (!EnableEpilogueVectorization) { 5597 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 5598 return Result; 5599 } 5600 5601 if (!CM.isScalarEpilogueAllowed()) { 5602 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 5603 "epilogue is allowed.\n"); 5604 return Result; 5605 } 5606 5607 // Not really a cost consideration, but check for unsupported cases here to 5608 // simplify the logic. 5609 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 5610 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 5611 "is not a supported candidate.\n"); 5612 return Result; 5613 } 5614 5615 if (EpilogueVectorizationForceVF > 1) { 5616 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 5617 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5618 if (hasPlanWithVF(ForcedEC)) 5619 return {ForcedEC, 0, 0}; 5620 else { 5621 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 5622 "viable.\n"); 5623 return Result; 5624 } 5625 } 5626 5627 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 5628 OrigLoop->getHeader()->getParent()->hasMinSize()) { 5629 LLVM_DEBUG( 5630 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 5631 return Result; 5632 } 5633 5634 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { 5635 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5636 "this loop\n"); 5637 return Result; 5638 } 5639 5640 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5641 // the main loop handles 8 lanes per iteration. We could still benefit from 5642 // vectorizing the epilogue loop with VF=4. 5643 ElementCount EstimatedRuntimeVF = MainLoopVF; 5644 if (MainLoopVF.isScalable()) { 5645 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5646 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5647 EstimatedRuntimeVF *= *VScale; 5648 } 5649 5650 ScalarEvolution &SE = *PSE.getSE(); 5651 Type *TCType = Legal->getWidestInductionType(); 5652 const SCEV *RemainingIterations = nullptr; 5653 for (auto &NextVF : ProfitableVFs) { 5654 // Skip candidate VFs without a corresponding VPlan. 5655 if (!hasPlanWithVF(NextVF.Width)) 5656 continue; 5657 5658 // Skip candidate VFs with widths >= the estimate runtime VF (scalable 5659 // vectors) or the VF of the main loop (fixed vectors). 5660 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5661 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 5662 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) 5663 continue; 5664 5665 // If NextVF is greater than the number of remaining iterations, the 5666 // epilogue loop would be dead. Skip such factors. 5667 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 5668 // TODO: extend to support scalable VFs. 
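      // Illustrative example (hypothetical numbers): with a trip count of 100,
      // MainLoopVF = 8 and IC = 2, the main vector loop consumes 16 iterations
      // at a time and leaves 100 % 16 = 4 iterations behind. An epilogue VF of
      // 8 could never execute, so it is filtered out by the check below.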
5669 if (!RemainingIterations) { 5670 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); 5671 RemainingIterations = SE.getURemExpr( 5672 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 5673 } 5674 if (SE.isKnownPredicate( 5675 CmpInst::ICMP_UGT, 5676 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 5677 RemainingIterations)) 5678 continue; 5679 } 5680 5681 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) 5682 Result = NextVF; 5683 } 5684 5685 if (Result != VectorizationFactor::Disabled()) 5686 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5687 << Result.Width << "\n"); 5688 return Result; 5689 } 5690 5691 std::pair<unsigned, unsigned> 5692 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5693 unsigned MinWidth = -1U; 5694 unsigned MaxWidth = 8; 5695 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5696 // For in-loop reductions, no element types are added to ElementTypesInLoop 5697 // if there are no loads/stores in the loop. In this case, check through the 5698 // reduction variables to determine the maximum width. 5699 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5700 // Reset MaxWidth so that we can find the smallest type used by recurrences 5701 // in the loop. 5702 MaxWidth = -1U; 5703 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5704 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5705 // When finding the min width used by the recurrence we need to account 5706 // for casts on the input operands of the recurrence. 5707 MaxWidth = std::min<unsigned>( 5708 MaxWidth, std::min<unsigned>( 5709 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5710 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5711 } 5712 } else { 5713 for (Type *T : ElementTypesInLoop) { 5714 MinWidth = std::min<unsigned>( 5715 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5716 MaxWidth = std::max<unsigned>( 5717 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5718 } 5719 } 5720 return {MinWidth, MaxWidth}; 5721 } 5722 5723 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5724 ElementTypesInLoop.clear(); 5725 // For each block. 5726 for (BasicBlock *BB : TheLoop->blocks()) { 5727 // For each instruction in the loop. 5728 for (Instruction &I : BB->instructionsWithoutDebug()) { 5729 Type *T = I.getType(); 5730 5731 // Skip ignored values. 5732 if (ValuesToIgnore.count(&I)) 5733 continue; 5734 5735 // Only examine Loads, Stores and PHINodes. 5736 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5737 continue; 5738 5739 // Examine PHI nodes that are reduction variables. Update the type to 5740 // account for the recurrence type. 5741 if (auto *PN = dyn_cast<PHINode>(&I)) { 5742 if (!Legal->isReductionVariable(PN)) 5743 continue; 5744 const RecurrenceDescriptor &RdxDesc = 5745 Legal->getReductionVars().find(PN)->second; 5746 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5747 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5748 RdxDesc.getRecurrenceType(), 5749 TargetTransformInfo::ReductionFlags())) 5750 continue; 5751 T = RdxDesc.getRecurrenceType(); 5752 } 5753 5754 // Examine the stored values. 
5755 if (auto *ST = dyn_cast<StoreInst>(&I)) 5756 T = ST->getValueOperand()->getType(); 5757 5758 assert(T->isSized() && 5759 "Expected the load/store/recurrence type to be sized"); 5760 5761 ElementTypesInLoop.insert(T); 5762 } 5763 } 5764 } 5765 5766 unsigned 5767 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5768 InstructionCost LoopCost) { 5769 // -- The interleave heuristics -- 5770 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5771 // There are many micro-architectural considerations that we can't predict 5772 // at this level. For example, frontend pressure (on decode or fetch) due to 5773 // code size, or the number and capabilities of the execution ports. 5774 // 5775 // We use the following heuristics to select the interleave count: 5776 // 1. If the code has reductions, then we interleave to break the cross 5777 // iteration dependency. 5778 // 2. If the loop is really small, then we interleave to reduce the loop 5779 // overhead. 5780 // 3. We don't interleave if we think that we will spill registers to memory 5781 // due to the increased register pressure. 5782 5783 if (!isScalarEpilogueAllowed()) 5784 return 1; 5785 5786 // We used the distance for the interleave count. 5787 if (!Legal->isSafeForAnyVectorWidth()) 5788 return 1; 5789 5790 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5791 const bool HasReductions = !Legal->getReductionVars().empty(); 5792 // Do not interleave loops with a relatively small known or estimated trip 5793 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5794 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5795 // because with the above conditions interleaving can expose ILP and break 5796 // cross iteration dependences for reductions. 5797 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5798 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5799 return 1; 5800 5801 // If we did not calculate the cost for VF (because the user selected the VF) 5802 // then we calculate the cost of VF here. 5803 if (LoopCost == 0) { 5804 LoopCost = expectedCost(VF).first; 5805 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5806 5807 // Loop body is free and there is no need for interleaving. 5808 if (LoopCost == 0) 5809 return 1; 5810 } 5811 5812 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5813 // We divide by these constants so assume that we have at least one 5814 // instruction that uses at least one register. 5815 for (auto& pair : R.MaxLocalUsers) { 5816 pair.second = std::max(pair.second, 1U); 5817 } 5818 5819 // We calculate the interleave count using the following formula. 5820 // Subtract the number of loop invariants from the number of available 5821 // registers. These registers are used by all of the interleaved instances. 5822 // Next, divide the remaining registers by the number of registers that is 5823 // required by the loop, in order to estimate how many parallel instances 5824 // fit without causing spills. All of this is rounded down if necessary to be 5825 // a power of two. We want power of two interleave count to simplify any 5826 // addressing operations or alignment considerations. 5827 // We also want power of two interleave counts to ensure that the induction 5828 // variable of the vector loop wraps to zero, when tail is folded by masking; 5829 // this currently happens when OptForSize, in which case IC is set to 1 above. 
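  // Worked example (hypothetical target): with 32 vector registers, 2 of them
  // tied up by loop-invariant values and a maximum local usage of 6 registers,
  // the estimate below is bit_floor((32 - 2) / 6) = bit_floor(5) = 4, i.e. an
  // interleave count of 4 for that register class; the final IC is the minimum
  // over all register classes.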
5830 unsigned IC = UINT_MAX; 5831 5832 for (auto& pair : R.MaxLocalUsers) { 5833 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5834 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5835 << " registers of " 5836 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5837 if (VF.isScalar()) { 5838 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5839 TargetNumRegisters = ForceTargetNumScalarRegs; 5840 } else { 5841 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5842 TargetNumRegisters = ForceTargetNumVectorRegs; 5843 } 5844 unsigned MaxLocalUsers = pair.second; 5845 unsigned LoopInvariantRegs = 0; 5846 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5847 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5848 5849 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5850 MaxLocalUsers); 5851 // Don't count the induction variable as interleaved. 5852 if (EnableIndVarRegisterHeur) { 5853 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5854 std::max(1U, (MaxLocalUsers - 1))); 5855 } 5856 5857 IC = std::min(IC, TmpIC); 5858 } 5859 5860 // Clamp the interleave ranges to reasonable counts. 5861 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5862 5863 // Check if the user has overridden the max. 5864 if (VF.isScalar()) { 5865 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5866 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5867 } else { 5868 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5869 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5870 } 5871 5872 // If trip count is known or estimated compile time constant, limit the 5873 // interleave count to be less than the trip count divided by VF, provided it 5874 // is at least 1. 5875 // 5876 // For scalable vectors we can't know if interleaving is beneficial. It may 5877 // not be beneficial for small loops if none of the lanes in the second vector 5878 // iterations is enabled. However, for larger loops, there is likely to be a 5879 // similar benefit as for fixed-width vectors. For now, we choose to leave 5880 // the InterleaveCount as if vscale is '1', although if some information about 5881 // the vector is known (e.g. min vector size), we can make a better decision. 5882 if (BestKnownTC) { 5883 MaxInterleaveCount = 5884 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5885 // Make sure MaxInterleaveCount is greater than 0. 5886 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5887 } 5888 5889 assert(MaxInterleaveCount > 0 && 5890 "Maximum interleave count must be greater than 0"); 5891 5892 // Clamp the calculated IC to be between the 1 and the max interleave count 5893 // that the target and trip count allows. 5894 if (IC > MaxInterleaveCount) 5895 IC = MaxInterleaveCount; 5896 else 5897 // Make sure IC is greater than 0. 5898 IC = std::max(1u, IC); 5899 5900 assert(IC > 0 && "Interleave count must be greater than 0."); 5901 5902 // Interleave if we vectorized this loop and there is a reduction that could 5903 // benefit from interleaving. 5904 if (VF.isVector() && HasReductions) { 5905 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5906 return IC; 5907 } 5908 5909 // For any scalar loop that either requires runtime checks or predication we 5910 // are better off leaving this to the unroller. 
Note that if we've already 5911 // vectorized the loop we will have done the runtime check and so interleaving 5912 // won't require further checks. 5913 bool ScalarInterleavingRequiresPredication = 5914 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5915 return Legal->blockNeedsPredication(BB); 5916 })); 5917 bool ScalarInterleavingRequiresRuntimePointerCheck = 5918 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5919 5920 // We want to interleave small loops in order to reduce the loop overhead and 5921 // potentially expose ILP opportunities. 5922 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5923 << "LV: IC is " << IC << '\n' 5924 << "LV: VF is " << VF << '\n'); 5925 const bool AggressivelyInterleaveReductions = 5926 TTI.enableAggressiveInterleaving(HasReductions); 5927 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5928 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5929 // We assume that the cost overhead is 1 and we use the cost model 5930 // to estimate the cost of the loop and interleave until the cost of the 5931 // loop overhead is about 5% of the cost of the loop. 5932 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5933 SmallLoopCost / *LoopCost.getValue())); 5934 5935 // Interleave until store/load ports (estimated by max interleave count) are 5936 // saturated. 5937 unsigned NumStores = Legal->getNumStores(); 5938 unsigned NumLoads = Legal->getNumLoads(); 5939 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5940 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5941 5942 // There is little point in interleaving for reductions containing selects 5943 // and compares when VF=1 since it may just create more overhead than it's 5944 // worth for loops with small trip counts. This is because we still have to 5945 // do the final reduction after the loop. 5946 bool HasSelectCmpReductions = 5947 HasReductions && 5948 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5949 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5950 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5951 RdxDesc.getRecurrenceKind()); 5952 }); 5953 if (HasSelectCmpReductions) { 5954 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5955 return 1; 5956 } 5957 5958 // If we have a scalar reduction (vector reductions are already dealt with 5959 // by this point), we can increase the critical path length if the loop 5960 // we're interleaving is inside another loop. For tree-wise reductions 5961 // set the limit to 2, and for ordered reductions it's best to disable 5962 // interleaving entirely. 
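    // (An ordered FP reduction must accumulate strictly in source order, so
    // the extra partial sums introduced by interleaving could not be
    // recombined without breaking the required evaluation order.)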
5963 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5964 bool HasOrderedReductions = 5965 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5966 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5967 return RdxDesc.isOrdered(); 5968 }); 5969 if (HasOrderedReductions) { 5970 LLVM_DEBUG( 5971 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5972 return 1; 5973 } 5974 5975 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5976 SmallIC = std::min(SmallIC, F); 5977 StoresIC = std::min(StoresIC, F); 5978 LoadsIC = std::min(LoadsIC, F); 5979 } 5980 5981 if (EnableLoadStoreRuntimeInterleave && 5982 std::max(StoresIC, LoadsIC) > SmallIC) { 5983 LLVM_DEBUG( 5984 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5985 return std::max(StoresIC, LoadsIC); 5986 } 5987 5988 // If there are scalar reductions and TTI has enabled aggressive 5989 // interleaving for reductions, we will interleave to expose ILP. 5990 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5991 AggressivelyInterleaveReductions) { 5992 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5993 // Interleave no less than SmallIC but not as aggressive as the normal IC 5994 // to satisfy the rare situation when resources are too limited. 5995 return std::max(IC / 2, SmallIC); 5996 } else { 5997 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5998 return SmallIC; 5999 } 6000 } 6001 6002 // Interleave if this is a large loop (small loops are already dealt with by 6003 // this point) that could benefit from interleaving. 6004 if (AggressivelyInterleaveReductions) { 6005 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6006 return IC; 6007 } 6008 6009 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6010 return 1; 6011 } 6012 6013 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6014 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6015 // This function calculates the register usage by measuring the highest number 6016 // of values that are alive at a single location. Obviously, this is a very 6017 // rough estimation. We scan the loop in a topological order in order and 6018 // assign a number to each instruction. We use RPO to ensure that defs are 6019 // met before their users. We assume that each instruction that has in-loop 6020 // users starts an interval. We record every time that an in-loop value is 6021 // used, so we have a list of the first and last occurrences of each 6022 // instruction. Next, we transpose this data structure into a multi map that 6023 // holds the list of intervals that *end* at a specific location. This multi 6024 // map allows us to perform a linear search. We scan the instructions linearly 6025 // and record each time that a new interval starts, by placing it in a set. 6026 // If we find this value in the multi-map then we remove it from the set. 6027 // The max register usage is the maximum size of the set. 6028 // We also search for instructions that are defined outside the loop, but are 6029 // used inside the loop. We need this number separately from the max-interval 6030 // usage number because when we unroll, loop-invariant values do not take 6031 // more register. 6032 LoopBlocksDFS DFS(TheLoop); 6033 DFS.perform(LI); 6034 6035 RegisterUsage RU; 6036 6037 // Each 'key' in the map opens a new interval. The values 6038 // of the map are the index of the 'last seen' usage of the 6039 // instruction that is the key. 
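  // For illustration (schematic only; indices are assigned in RPO order):
  //   0: %a = load ...
  //   1: %b = load ...
  //   2: %c = add %a, %b   ; last use of %b
  //   3: %d = mul %c, %a   ; last use of %a and %c
  //   4:      store %d     ; last use of %d
  // While scanning, at most two intervals (%a/%b, later %a/%c) are open at the
  // same time, so the per-class register estimate for this snippet is about 2.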
6040 using IntervalMap = DenseMap<Instruction *, unsigned>; 6041 6042 // Maps instruction to its index. 6043 SmallVector<Instruction *, 64> IdxToInstr; 6044 // Marks the end of each interval. 6045 IntervalMap EndPoint; 6046 // Saves the list of instruction indices that are used in the loop. 6047 SmallPtrSet<Instruction *, 8> Ends; 6048 // Saves the list of values that are used in the loop but are defined outside 6049 // the loop (not including non-instruction values such as arguments and 6050 // constants). 6051 SmallSetVector<Instruction *, 8> LoopInvariants; 6052 6053 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6054 for (Instruction &I : BB->instructionsWithoutDebug()) { 6055 IdxToInstr.push_back(&I); 6056 6057 // Save the end location of each USE. 6058 for (Value *U : I.operands()) { 6059 auto *Instr = dyn_cast<Instruction>(U); 6060 6061 // Ignore non-instruction values such as arguments, constants, etc. 6062 // FIXME: Might need some motivation why these values are ignored. If 6063 // for example an argument is used inside the loop it will increase the 6064 // register pressure (so shouldn't we add it to LoopInvariants). 6065 if (!Instr) 6066 continue; 6067 6068 // If this instruction is outside the loop then record it and continue. 6069 if (!TheLoop->contains(Instr)) { 6070 LoopInvariants.insert(Instr); 6071 continue; 6072 } 6073 6074 // Overwrite previous end points. 6075 EndPoint[Instr] = IdxToInstr.size(); 6076 Ends.insert(Instr); 6077 } 6078 } 6079 } 6080 6081 // Saves the list of intervals that end with the index in 'key'. 6082 using InstrList = SmallVector<Instruction *, 2>; 6083 DenseMap<unsigned, InstrList> TransposeEnds; 6084 6085 // Transpose the EndPoints to a list of values that end at each index. 6086 for (auto &Interval : EndPoint) 6087 TransposeEnds[Interval.second].push_back(Interval.first); 6088 6089 SmallPtrSet<Instruction *, 8> OpenIntervals; 6090 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6091 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6092 6093 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6094 6095 const auto &TTICapture = TTI; 6096 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6097 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6098 return 0; 6099 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6100 }; 6101 6102 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6103 Instruction *I = IdxToInstr[i]; 6104 6105 // Remove all of the instructions that end at this location. 6106 InstrList &List = TransposeEnds[i]; 6107 for (Instruction *ToRemove : List) 6108 OpenIntervals.erase(ToRemove); 6109 6110 // Ignore instructions that are never used within the loop. 6111 if (!Ends.count(I)) 6112 continue; 6113 6114 // Skip ignored values. 6115 if (ValuesToIgnore.count(I)) 6116 continue; 6117 6118 // For each VF find the maximum usage of registers. 6119 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6120 // Count the number of registers used, per register class, given all open 6121 // intervals. 6122 // Note that elements in this SmallMapVector will be default constructed 6123 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 6124 // there is no previous entry for ClassID. 
6125 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6126 6127 if (VFs[j].isScalar()) { 6128 for (auto *Inst : OpenIntervals) { 6129 unsigned ClassID = 6130 TTI.getRegisterClassForType(false, Inst->getType()); 6131 // FIXME: The target might use more than one register for the type 6132 // even in the scalar case. 6133 RegUsage[ClassID] += 1; 6134 } 6135 } else { 6136 collectUniformsAndScalars(VFs[j]); 6137 for (auto *Inst : OpenIntervals) { 6138 // Skip ignored values for VF > 1. 6139 if (VecValuesToIgnore.count(Inst)) 6140 continue; 6141 if (isScalarAfterVectorization(Inst, VFs[j])) { 6142 unsigned ClassID = 6143 TTI.getRegisterClassForType(false, Inst->getType()); 6144 // FIXME: The target might use more than one register for the type 6145 // even in the scalar case. 6146 RegUsage[ClassID] += 1; 6147 } else { 6148 unsigned ClassID = 6149 TTI.getRegisterClassForType(true, Inst->getType()); 6150 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6151 } 6152 } 6153 } 6154 6155 for (auto& pair : RegUsage) { 6156 auto &Entry = MaxUsages[j][pair.first]; 6157 Entry = std::max(Entry, pair.second); 6158 } 6159 } 6160 6161 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6162 << OpenIntervals.size() << '\n'); 6163 6164 // Add the current instruction to the list of open intervals. 6165 OpenIntervals.insert(I); 6166 } 6167 6168 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6169 // Note that elements in this SmallMapVector will be default constructed 6170 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 6171 // there is no previous entry for ClassID. 6172 SmallMapVector<unsigned, unsigned, 4> Invariant; 6173 6174 for (auto *Inst : LoopInvariants) { 6175 // FIXME: The target might use more than one register for the type 6176 // even in the scalar case. 6177 bool IsScalar = all_of(Inst->users(), [&](User *U) { 6178 auto *I = cast<Instruction>(U); 6179 return TheLoop != LI->getLoopFor(I->getParent()) || 6180 isScalarAfterVectorization(I, VFs[i]); 6181 }); 6182 6183 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; 6184 unsigned ClassID = 6185 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 6186 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 6187 } 6188 6189 LLVM_DEBUG({ 6190 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6191 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6192 << " item\n"; 6193 for (const auto &pair : MaxUsages[i]) { 6194 dbgs() << "LV(REG): RegisterClass: " 6195 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6196 << " registers\n"; 6197 } 6198 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6199 << " item\n"; 6200 for (const auto &pair : Invariant) { 6201 dbgs() << "LV(REG): RegisterClass: " 6202 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6203 << " registers\n"; 6204 } 6205 }); 6206 6207 RU.LoopInvariantRegs = Invariant; 6208 RU.MaxLocalUsers = MaxUsages[i]; 6209 RUs[i] = RU; 6210 } 6211 6212 return RUs; 6213 } 6214 6215 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6216 ElementCount VF) { 6217 // TODO: Cost model for emulated masked load/store is completely 6218 // broken. This hack guides the cost model to use an artificially 6219 // high enough value to practically disable vectorization with such 6220 // operations, except where previously deployed legality hack allowed 6221 // using very low cost values. 
This is to avoid regressions coming simply 6222 // from moving "masked load/store" check from legality to cost model. 6223 // Masked Load/Gather emulation was previously never allowed. 6224 // Limited number of Masked Store/Scatter emulation was allowed. 6225 assert((isPredicatedInst(I)) && 6226 "Expecting a scalar emulated instruction"); 6227 return isa<LoadInst>(I) || 6228 (isa<StoreInst>(I) && 6229 NumPredStores > NumberOfStoresToPredicate); 6230 } 6231 6232 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6233 // If we aren't vectorizing the loop, or if we've already collected the 6234 // instructions to scalarize, there's nothing to do. Collection may already 6235 // have occurred if we have a user-selected VF and are now computing the 6236 // expected cost for interleaving. 6237 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 6238 return; 6239 6240 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6241 // not profitable to scalarize any instructions, the presence of VF in the 6242 // map will indicate that we've analyzed it already. 6243 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6244 6245 PredicatedBBsAfterVectorization[VF].clear(); 6246 6247 // Find all the instructions that are scalar with predication in the loop and 6248 // determine if it would be better to not if-convert the blocks they are in. 6249 // If so, we also record the instructions to scalarize. 6250 for (BasicBlock *BB : TheLoop->blocks()) { 6251 if (!blockNeedsPredicationForAnyReason(BB)) 6252 continue; 6253 for (Instruction &I : *BB) 6254 if (isScalarWithPredication(&I, VF)) { 6255 ScalarCostsTy ScalarCosts; 6256 // Do not apply discount if scalable, because that would lead to 6257 // invalid scalarization costs. 6258 // Do not apply discount logic if hacked cost is needed 6259 // for emulated masked memrefs. 6260 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6261 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6262 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6263 // Remember that BB will remain after vectorization. 6264 PredicatedBBsAfterVectorization[VF].insert(BB); 6265 } 6266 } 6267 } 6268 6269 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 6270 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6271 assert(!isUniformAfterVectorization(PredInst, VF) && 6272 "Instruction marked uniform-after-vectorization will be predicated"); 6273 6274 // Initialize the discount to zero, meaning that the scalar version and the 6275 // vector version cost the same. 6276 InstructionCost Discount = 0; 6277 6278 // Holds instructions to analyze. The instructions we visit are mapped in 6279 // ScalarCosts. Those instructions are the ones that would be scalarized if 6280 // we find that the scalar version costs less. 6281 SmallVector<Instruction *, 8> Worklist; 6282 6283 // Returns true if the given instruction can be scalarized. 6284 auto canBeScalarized = [&](Instruction *I) -> bool { 6285 // We only attempt to scalarize instructions forming a single-use chain 6286 // from the original predicated block that would otherwise be vectorized. 6287 // Although not strictly necessary, we give up on instructions we know will 6288 // already be scalar to avoid traversing chains that are unlikely to be 6289 // beneficial. 
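    // A typical candidate is an address computation (e.g. a getelementptr)
    // feeding a predicated store: if the store is its only user and both live
    // in the same block, scalarizing the GEP together with the store avoids
    // forming a vector of addresses that would only be used lane by lane.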
6290 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6291 isScalarAfterVectorization(I, VF)) 6292 return false; 6293 6294 // If the instruction is scalar with predication, it will be analyzed 6295 // separately. We ignore it within the context of PredInst. 6296 if (isScalarWithPredication(I, VF)) 6297 return false; 6298 6299 // If any of the instruction's operands are uniform after vectorization, 6300 // the instruction cannot be scalarized. This prevents, for example, a 6301 // masked load from being scalarized. 6302 // 6303 // We assume we will only emit a value for lane zero of an instruction 6304 // marked uniform after vectorization, rather than VF identical values. 6305 // Thus, if we scalarize an instruction that uses a uniform, we would 6306 // create uses of values corresponding to the lanes we aren't emitting code 6307 // for. This behavior can be changed by allowing getScalarValue to clone 6308 // the lane zero values for uniforms rather than asserting. 6309 for (Use &U : I->operands()) 6310 if (auto *J = dyn_cast<Instruction>(U.get())) 6311 if (isUniformAfterVectorization(J, VF)) 6312 return false; 6313 6314 // Otherwise, we can scalarize the instruction. 6315 return true; 6316 }; 6317 6318 // Compute the expected cost discount from scalarizing the entire expression 6319 // feeding the predicated instruction. We currently only consider expressions 6320 // that are single-use instruction chains. 6321 Worklist.push_back(PredInst); 6322 while (!Worklist.empty()) { 6323 Instruction *I = Worklist.pop_back_val(); 6324 6325 // If we've already analyzed the instruction, there's nothing to do. 6326 if (ScalarCosts.contains(I)) 6327 continue; 6328 6329 // Compute the cost of the vector instruction. Note that this cost already 6330 // includes the scalarization overhead of the predicated instruction. 6331 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6332 6333 // Compute the cost of the scalarized instruction. This cost is the cost of 6334 // the instruction as if it wasn't if-converted and instead remained in the 6335 // predicated block. We will scale this cost by block probability after 6336 // computing the scalarization overhead. 6337 InstructionCost ScalarCost = 6338 VF.getFixedValue() * 6339 getInstructionCost(I, ElementCount::getFixed(1)).first; 6340 6341 // Compute the scalarization overhead of needed insertelement instructions 6342 // and phi nodes. 6343 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6344 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6345 ScalarCost += TTI.getScalarizationOverhead( 6346 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6347 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 6348 /*Extract*/ false, CostKind); 6349 ScalarCost += 6350 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6351 } 6352 6353 // Compute the scalarization overhead of needed extractelement 6354 // instructions. For each of the instruction's operands, if the operand can 6355 // be scalarized, add it to the worklist; otherwise, account for the 6356 // overhead. 
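    // For example, at VF = 4 an operand that stays vectorized must feed the
    // four scalarized copies of this instruction through four extractelement
    // instructions, and that extraction cost is charged here.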
6357 for (Use &U : I->operands()) 6358 if (auto *J = dyn_cast<Instruction>(U.get())) { 6359 assert(VectorType::isValidElementType(J->getType()) && 6360 "Instruction has non-scalar type"); 6361 if (canBeScalarized(J)) 6362 Worklist.push_back(J); 6363 else if (needsExtract(J, VF)) { 6364 ScalarCost += TTI.getScalarizationOverhead( 6365 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6366 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 6367 /*Extract*/ true, CostKind); 6368 } 6369 } 6370 6371 // Scale the total scalar cost by block probability. 6372 ScalarCost /= getReciprocalPredBlockProb(); 6373 6374 // Compute the discount. A non-negative discount means the vector version 6375 // of the instruction costs more, and scalarizing would be beneficial. 6376 Discount += VectorCost - ScalarCost; 6377 ScalarCosts[I] = ScalarCost; 6378 } 6379 6380 return Discount; 6381 } 6382 6383 LoopVectorizationCostModel::VectorizationCostTy 6384 LoopVectorizationCostModel::expectedCost( 6385 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6386 VectorizationCostTy Cost; 6387 6388 // For each block. 6389 for (BasicBlock *BB : TheLoop->blocks()) { 6390 VectorizationCostTy BlockCost; 6391 6392 // For each instruction in the old loop. 6393 for (Instruction &I : BB->instructionsWithoutDebug()) { 6394 // Skip ignored values. 6395 if (ValuesToIgnore.count(&I) || 6396 (VF.isVector() && VecValuesToIgnore.count(&I))) 6397 continue; 6398 6399 VectorizationCostTy C = getInstructionCost(&I, VF); 6400 6401 // Check if we should override the cost. 6402 if (C.first.isValid() && 6403 ForceTargetInstructionCost.getNumOccurrences() > 0) 6404 C.first = InstructionCost(ForceTargetInstructionCost); 6405 6406 // Keep a list of instructions with invalid costs. 6407 if (Invalid && !C.first.isValid()) 6408 Invalid->emplace_back(&I, VF); 6409 6410 BlockCost.first += C.first; 6411 BlockCost.second |= C.second; 6412 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6413 << " for VF " << VF << " For instruction: " << I 6414 << '\n'); 6415 } 6416 6417 // If we are vectorizing a predicated block, it will have been 6418 // if-converted. This means that the block's instructions (aside from 6419 // stores and instructions that may divide by zero) will now be 6420 // unconditionally executed. For the scalar case, we may not always execute 6421 // the predicated block, if it is an if-else block. Thus, scale the block's 6422 // cost by the probability of executing it. blockNeedsPredication from 6423 // Legal is used so as to not include all blocks in tail folded loops. 6424 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6425 BlockCost.first /= getReciprocalPredBlockProb(); 6426 6427 Cost.first += BlockCost.first; 6428 Cost.second |= BlockCost.second; 6429 } 6430 6431 return Cost; 6432 } 6433 6434 /// Gets Address Access SCEV after verifying that the access pattern 6435 /// is loop invariant except the induction variable dependence. 6436 /// 6437 /// This SCEV can be sent to the Target in order to estimate the address 6438 /// calculation cost. 6439 static const SCEV *getAddressAccessSCEV( 6440 Value *Ptr, 6441 LoopVectorizationLegality *Legal, 6442 PredicatedScalarEvolution &PSE, 6443 const Loop *TheLoop) { 6444 6445 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6446 if (!Gep) 6447 return nullptr; 6448 6449 // We are looking for a gep with all loop invariant indices except for one 6450 // which should be an induction variable. 
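  // For example (illustrative IR):
  //   %gep = getelementptr inbounds [512 x i32], ptr %A, i64 %inv, i64 %iv
  // where %inv is defined outside the loop and %iv is an induction variable.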
6451 auto SE = PSE.getSE(); 6452 unsigned NumOperands = Gep->getNumOperands(); 6453 for (unsigned i = 1; i < NumOperands; ++i) { 6454 Value *Opd = Gep->getOperand(i); 6455 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6456 !Legal->isInductionVariable(Opd)) 6457 return nullptr; 6458 } 6459 6460 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6461 return PSE.getSCEV(Ptr); 6462 } 6463 6464 InstructionCost 6465 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6466 ElementCount VF) { 6467 assert(VF.isVector() && 6468 "Scalarization cost of instruction implies vectorization."); 6469 if (VF.isScalable()) 6470 return InstructionCost::getInvalid(); 6471 6472 Type *ValTy = getLoadStoreType(I); 6473 auto SE = PSE.getSE(); 6474 6475 unsigned AS = getLoadStoreAddressSpace(I); 6476 Value *Ptr = getLoadStorePointerOperand(I); 6477 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6478 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6479 // that it is being called from this specific place. 6480 6481 // Figure out whether the access is strided and get the stride value 6482 // if it's known in compile time 6483 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6484 6485 // Get the cost of the scalar memory instruction and address computation. 6486 InstructionCost Cost = 6487 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6488 6489 // Don't pass *I here, since it is scalar but will actually be part of a 6490 // vectorized loop where the user of it is a vectorized instruction. 6491 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6492 const Align Alignment = getLoadStoreAlignment(I); 6493 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6494 ValTy->getScalarType(), 6495 Alignment, AS, CostKind); 6496 6497 // Get the overhead of the extractelement and insertelement instructions 6498 // we might create due to scalarization. 6499 Cost += getScalarizationOverhead(I, VF, CostKind); 6500 6501 // If we have a predicated load/store, it will need extra i1 extracts and 6502 // conditional branches, but may not be executed for each vector lane. Scale 6503 // the cost by the probability of executing the predicated block. 6504 if (isPredicatedInst(I)) { 6505 Cost /= getReciprocalPredBlockProb(); 6506 6507 // Add the cost of an i1 extract and a branch 6508 auto *Vec_i1Ty = 6509 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6510 Cost += TTI.getScalarizationOverhead( 6511 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6512 /*Insert=*/false, /*Extract=*/true, CostKind); 6513 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6514 6515 if (useEmulatedMaskMemRefHack(I, VF)) 6516 // Artificially setting to a high enough value to practically disable 6517 // vectorization with such operations. 
6518 Cost = 3000000; 6519 } 6520 6521 return Cost; 6522 } 6523 6524 InstructionCost 6525 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6526 ElementCount VF) { 6527 Type *ValTy = getLoadStoreType(I); 6528 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6529 Value *Ptr = getLoadStorePointerOperand(I); 6530 unsigned AS = getLoadStoreAddressSpace(I); 6531 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6532 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6533 6534 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6535 "Stride should be 1 or -1 for consecutive memory access"); 6536 const Align Alignment = getLoadStoreAlignment(I); 6537 InstructionCost Cost = 0; 6538 if (Legal->isMaskRequired(I)) { 6539 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6540 CostKind); 6541 } else { 6542 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6543 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6544 CostKind, OpInfo, I); 6545 } 6546 6547 bool Reverse = ConsecutiveStride < 0; 6548 if (Reverse) 6549 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6550 std::nullopt, CostKind, 0); 6551 return Cost; 6552 } 6553 6554 InstructionCost 6555 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6556 ElementCount VF) { 6557 assert(Legal->isUniformMemOp(*I, VF)); 6558 6559 Type *ValTy = getLoadStoreType(I); 6560 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6561 const Align Alignment = getLoadStoreAlignment(I); 6562 unsigned AS = getLoadStoreAddressSpace(I); 6563 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6564 if (isa<LoadInst>(I)) { 6565 return TTI.getAddressComputationCost(ValTy) + 6566 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6567 CostKind) + 6568 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6569 } 6570 StoreInst *SI = cast<StoreInst>(I); 6571 6572 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 6573 return TTI.getAddressComputationCost(ValTy) + 6574 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6575 CostKind) + 6576 (isLoopInvariantStoreValue 6577 ? 
0 6578 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6579 CostKind, VF.getKnownMinValue() - 1)); 6580 } 6581 6582 InstructionCost 6583 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6584 ElementCount VF) { 6585 Type *ValTy = getLoadStoreType(I); 6586 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6587 const Align Alignment = getLoadStoreAlignment(I); 6588 const Value *Ptr = getLoadStorePointerOperand(I); 6589 6590 return TTI.getAddressComputationCost(VectorTy) + 6591 TTI.getGatherScatterOpCost( 6592 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6593 TargetTransformInfo::TCK_RecipThroughput, I); 6594 } 6595 6596 InstructionCost 6597 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6598 ElementCount VF) { 6599 Type *ValTy = getLoadStoreType(I); 6600 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6601 unsigned AS = getLoadStoreAddressSpace(I); 6602 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6603 6604 auto Group = getInterleavedAccessGroup(I); 6605 assert(Group && "Fail to get an interleaved access group."); 6606 6607 unsigned InterleaveFactor = Group->getFactor(); 6608 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6609 6610 // Holds the indices of existing members in the interleaved group. 6611 SmallVector<unsigned, 4> Indices; 6612 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6613 if (Group->getMember(IF)) 6614 Indices.push_back(IF); 6615 6616 // Calculate the cost of the whole interleaved group. 6617 bool UseMaskForGaps = 6618 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6619 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6620 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6621 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6622 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6623 6624 if (Group->isReverse()) { 6625 // TODO: Add support for reversed masked interleaved access. 6626 assert(!Legal->isMaskRequired(I) && 6627 "Reverse masked interleaved access not supported."); 6628 Cost += Group->getNumMembers() * 6629 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6630 std::nullopt, CostKind, 0); 6631 } 6632 return Cost; 6633 } 6634 6635 std::optional<InstructionCost> 6636 LoopVectorizationCostModel::getReductionPatternCost( 6637 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6638 using namespace llvm::PatternMatch; 6639 // Early exit for no inloop reductions 6640 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6641 return std::nullopt; 6642 auto *VectorTy = cast<VectorType>(Ty); 6643 6644 // We are looking for a pattern of, and finding the minimal acceptable cost: 6645 // reduce(mul(ext(A), ext(B))) or 6646 // reduce(mul(A, B)) or 6647 // reduce(ext(A)) or 6648 // reduce(A). 6649 // The basic idea is that we walk down the tree to do that, finding the root 6650 // reduction instruction in InLoopReductionImmediateChains. From there we find 6651 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6652 // of the components. If the reduction cost is lower then we return it for the 6653 // reduction instruction and 0 for the other instructions in the pattern. If 6654 // it is not we return an invalid cost specifying the orignal cost method 6655 // should be used. 
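  // Schematic of a chain this matches, for reduce.add(mul(ext(A), ext(B)))
  // (illustrative IR only; a target may price the whole pattern as a single
  // dot-product-like operation via getMulAccReductionCost):
  //   %ea  = sext <16 x i8> %a to <16 x i32>
  //   %eb  = sext <16 x i8> %b to <16 x i32>
  //   %mul = mul nsw <16 x i32> %ea, %eb
  //   %rdx = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
  //   %sum = add i32 %rdx, %sum.phi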
6656 Instruction *RetI = I; 6657 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6658 if (!RetI->hasOneUser()) 6659 return std::nullopt; 6660 RetI = RetI->user_back(); 6661 } 6662 6663 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 6664 RetI->user_back()->getOpcode() == Instruction::Add) { 6665 RetI = RetI->user_back(); 6666 } 6667 6668 // Test if the found instruction is a reduction, and if not return an invalid 6669 // cost specifying the parent to use the original cost modelling. 6670 if (!InLoopReductionImmediateChains.count(RetI)) 6671 return std::nullopt; 6672 6673 // Find the reduction this chain is a part of and calculate the basic cost of 6674 // the reduction on its own. 6675 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6676 Instruction *ReductionPhi = LastChain; 6677 while (!isa<PHINode>(ReductionPhi)) 6678 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6679 6680 const RecurrenceDescriptor &RdxDesc = 6681 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6682 6683 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6684 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6685 6686 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6687 // normal fmul instruction to the cost of the fadd reduction. 6688 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6689 BaseCost += 6690 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6691 6692 // If we're using ordered reductions then we can just return the base cost 6693 // here, since getArithmeticReductionCost calculates the full ordered 6694 // reduction cost when FP reassociation is not allowed. 6695 if (useOrderedReductions(RdxDesc)) 6696 return BaseCost; 6697 6698 // Get the operand that was not the reduction chain and match it to one of the 6699 // patterns, returning the better cost if it is found. 6700 Instruction *RedOp = RetI->getOperand(1) == LastChain 6701 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6702 : dyn_cast<Instruction>(RetI->getOperand(1)); 6703 6704 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6705 6706 Instruction *Op0, *Op1; 6707 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6708 match(RedOp, 6709 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6710 match(Op0, m_ZExtOrSExt(m_Value())) && 6711 Op0->getOpcode() == Op1->getOpcode() && 6712 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6713 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6714 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6715 6716 // Matched reduce.add(ext(mul(ext(A), ext(B))) 6717 // Note that the extend opcodes need to all match, or if A==B they will have 6718 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6719 // which is equally fine. 
6720 bool IsUnsigned = isa<ZExtInst>(Op0); 6721 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6722 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6723 6724 InstructionCost ExtCost = 6725 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6726 TTI::CastContextHint::None, CostKind, Op0); 6727 InstructionCost MulCost = 6728 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6729 InstructionCost Ext2Cost = 6730 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6731 TTI::CastContextHint::None, CostKind, RedOp); 6732 6733 InstructionCost RedCost = TTI.getMulAccReductionCost( 6734 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6735 6736 if (RedCost.isValid() && 6737 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6738 return I == RetI ? RedCost : 0; 6739 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6740 !TheLoop->isLoopInvariant(RedOp)) { 6741 // Matched reduce(ext(A)) 6742 bool IsUnsigned = isa<ZExtInst>(RedOp); 6743 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6744 InstructionCost RedCost = TTI.getExtendedReductionCost( 6745 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6746 RdxDesc.getFastMathFlags(), CostKind); 6747 6748 InstructionCost ExtCost = 6749 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6750 TTI::CastContextHint::None, CostKind, RedOp); 6751 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6752 return I == RetI ? RedCost : 0; 6753 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6754 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6755 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6756 Op0->getOpcode() == Op1->getOpcode() && 6757 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6758 bool IsUnsigned = isa<ZExtInst>(Op0); 6759 Type *Op0Ty = Op0->getOperand(0)->getType(); 6760 Type *Op1Ty = Op1->getOperand(0)->getType(); 6761 Type *LargestOpTy = 6762 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6763 : Op0Ty; 6764 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6765 6766 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6767 // different sizes. We take the largest type as the ext to reduce, and add 6768 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6769 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6770 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6771 TTI::CastContextHint::None, CostKind, Op0); 6772 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6773 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6774 TTI::CastContextHint::None, CostKind, Op1); 6775 InstructionCost MulCost = 6776 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6777 6778 InstructionCost RedCost = TTI.getMulAccReductionCost( 6779 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6780 InstructionCost ExtraExtCost = 0; 6781 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6782 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6783 ExtraExtCost = TTI.getCastInstrCost( 6784 ExtraExtOp->getOpcode(), ExtType, 6785 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6786 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6787 } 6788 6789 if (RedCost.isValid() && 6790 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6791 return I == RetI ? 
RedCost : 0; 6792 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6793 // Matched reduce.add(mul()) 6794 InstructionCost MulCost = 6795 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6796 6797 InstructionCost RedCost = TTI.getMulAccReductionCost( 6798 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6799 6800 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6801 return I == RetI ? RedCost : 0; 6802 } 6803 } 6804 6805 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6806 } 6807 6808 InstructionCost 6809 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6810 ElementCount VF) { 6811 // Calculate scalar cost only. Vectorization cost should be ready at this 6812 // moment. 6813 if (VF.isScalar()) { 6814 Type *ValTy = getLoadStoreType(I); 6815 const Align Alignment = getLoadStoreAlignment(I); 6816 unsigned AS = getLoadStoreAddressSpace(I); 6817 6818 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6819 return TTI.getAddressComputationCost(ValTy) + 6820 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6821 TTI::TCK_RecipThroughput, OpInfo, I); 6822 } 6823 return getWideningCost(I, VF); 6824 } 6825 6826 LoopVectorizationCostModel::VectorizationCostTy 6827 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6828 ElementCount VF) { 6829 // If we know that this instruction will remain uniform, check the cost of 6830 // the scalar version. 6831 if (isUniformAfterVectorization(I, VF)) 6832 VF = ElementCount::getFixed(1); 6833 6834 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6835 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6836 6837 // Forced scalars do not have any scalarization overhead. 6838 auto ForcedScalar = ForcedScalars.find(VF); 6839 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6840 auto InstSet = ForcedScalar->second; 6841 if (InstSet.count(I)) 6842 return VectorizationCostTy( 6843 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6844 VF.getKnownMinValue()), 6845 false); 6846 } 6847 6848 Type *VectorTy; 6849 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6850 6851 bool TypeNotScalarized = false; 6852 if (VF.isVector() && VectorTy->isVectorTy()) { 6853 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6854 if (VF.isScalable()) 6855 // <vscale x 1 x iN> is assumed to be profitable over iN because 6856 // scalable registers are a distinct register class from scalar ones. 6857 // If we ever find a target which wants to lower scalable vectors 6858 // back to scalars, we'll need to update this code to explicitly 6859 // ask TTI about the register class uses for each part. 6860 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6861 else 6862 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6863 } else 6864 C = InstructionCost::getInvalid(); 6865 } 6866 return VectorizationCostTy(C, TypeNotScalarized); 6867 } 6868 6869 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6870 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6871 6872 // There is no mechanism yet to create a scalable scalarization loop, 6873 // so this is currently Invalid. 
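  // (Returning Invalid here makes any plan that would need to scalarize a
  // scalable type compare as unprofitable against valid alternatives, or
  // causes vectorization to be abandoned if no valid alternative exists.)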
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert*/ true,
        /*Extract*/ false, CostKind);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys, CostKind);
}

void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto isLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(I))
            return true;

          // A uniform store isn't necessarily uniform-by-parts
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(I);
          return TheLoop->isLoopInvariant(SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I, VF) ?
          getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost = isLegalToScalarize() ?
          getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();

        // Choose the better solution for the current VF. Note that Invalid
        // costs compare as maximally large.
If both are invalid, we get 6965 // scalable invalid which signals a failure and a vectorization abort. 6966 if (GatherScatterCost < ScalarizationCost) 6967 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6968 else 6969 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6970 continue; 6971 } 6972 6973 // We assume that widening is the best solution when possible. 6974 if (memoryInstructionCanBeWidened(&I, VF)) { 6975 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6976 int ConsecutiveStride = Legal->isConsecutivePtr( 6977 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6978 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6979 "Expected consecutive stride."); 6980 InstWidening Decision = 6981 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6982 setWideningDecision(&I, VF, Decision, Cost); 6983 continue; 6984 } 6985 6986 // Choose between Interleaving, Gather/Scatter or Scalarization. 6987 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6988 unsigned NumAccesses = 1; 6989 if (isAccessInterleaved(&I)) { 6990 auto Group = getInterleavedAccessGroup(&I); 6991 assert(Group && "Fail to get an interleaved access group."); 6992 6993 // Make one decision for the whole group. 6994 if (getWideningDecision(&I, VF) != CM_Unknown) 6995 continue; 6996 6997 NumAccesses = Group->getNumMembers(); 6998 if (interleavedAccessCanBeWidened(&I, VF)) 6999 InterleaveCost = getInterleaveGroupCost(&I, VF); 7000 } 7001 7002 InstructionCost GatherScatterCost = 7003 isLegalGatherOrScatter(&I, VF) 7004 ? getGatherScatterCost(&I, VF) * NumAccesses 7005 : InstructionCost::getInvalid(); 7006 7007 InstructionCost ScalarizationCost = 7008 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7009 7010 // Choose better solution for the current VF, 7011 // write down this decision and use it during vectorization. 7012 InstructionCost Cost; 7013 InstWidening Decision; 7014 if (InterleaveCost <= GatherScatterCost && 7015 InterleaveCost < ScalarizationCost) { 7016 Decision = CM_Interleave; 7017 Cost = InterleaveCost; 7018 } else if (GatherScatterCost < ScalarizationCost) { 7019 Decision = CM_GatherScatter; 7020 Cost = GatherScatterCost; 7021 } else { 7022 Decision = CM_Scalarize; 7023 Cost = ScalarizationCost; 7024 } 7025 // If the instructions belongs to an interleave group, the whole group 7026 // receives the same decision. The whole group receives the cost, but 7027 // the cost will actually be assigned to one instruction. 7028 if (auto Group = getInterleavedAccessGroup(&I)) 7029 setWideningDecision(Group, VF, Decision, Cost); 7030 else 7031 setWideningDecision(&I, VF, Decision, Cost); 7032 } 7033 } 7034 7035 // Make sure that any load of address and any other address computation 7036 // remains scalar unless there is gather/scatter support. This avoids 7037 // inevitable extracts into address registers, and also has the benefit of 7038 // activating LSR more, since that pass can't optimize vectorized 7039 // addresses. 7040 if (TTI.prefersVectorizedAddressing()) 7041 return; 7042 7043 // Start with all scalar pointer uses. 7044 SmallPtrSet<Instruction *, 8> AddrDefs; 7045 for (BasicBlock *BB : TheLoop->blocks()) 7046 for (Instruction &I : *BB) { 7047 Instruction *PtrDef = 7048 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7049 if (PtrDef && TheLoop->contains(PtrDef) && 7050 getWideningDecision(&I, VF) != CM_GatherScatter) 7051 AddrDefs.insert(PtrDef); 7052 } 7053 7054 // Add all instructions used to generate the addresses. 
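  // This is a simple transitive walk: starting from the scalar pointer
  // operands collected above, pull in every same-block, non-PHI operand so
  // that the whole address computation stays scalar alongside the memory
  // instruction itself.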
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
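  // The cases below mirror the recipes the vectorizer can emit for each
  // opcode; any opcode that falls through to the default case is
  // conservatively costed like a 'mul'.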
7137 switch (I->getOpcode()) { 7138 case Instruction::GetElementPtr: 7139 // We mark this instruction as zero-cost because the cost of GEPs in 7140 // vectorized code depends on whether the corresponding memory instruction 7141 // is scalarized or not. Therefore, we handle GEPs with the memory 7142 // instruction cost. 7143 return 0; 7144 case Instruction::Br: { 7145 // In cases of scalarized and predicated instructions, there will be VF 7146 // predicated blocks in the vectorized loop. Each branch around these 7147 // blocks requires also an extract of its vector compare i1 element. 7148 bool ScalarPredicatedBB = false; 7149 BranchInst *BI = cast<BranchInst>(I); 7150 if (VF.isVector() && BI->isConditional() && 7151 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 7152 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 7153 ScalarPredicatedBB = true; 7154 7155 if (ScalarPredicatedBB) { 7156 // Not possible to scalarize scalable vector with predicated instructions. 7157 if (VF.isScalable()) 7158 return InstructionCost::getInvalid(); 7159 // Return cost for branches around scalarized and predicated blocks. 7160 auto *Vec_i1Ty = 7161 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7162 return ( 7163 TTI.getScalarizationOverhead( 7164 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 7165 /*Insert*/ false, /*Extract*/ true, CostKind) + 7166 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7167 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7168 // The back-edge branch will remain, as will all scalar branches. 7169 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7170 else 7171 // This branch will be eliminated by if-conversion. 7172 return 0; 7173 // Note: We currently assume zero cost for an unconditional branch inside 7174 // a predicated block since it will become a fall-through, although we 7175 // may decide in the future to call TTI for all branches. 7176 } 7177 case Instruction::PHI: { 7178 auto *Phi = cast<PHINode>(I); 7179 7180 // First-order recurrences are replaced by vector shuffles inside the loop. 7181 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 7182 SmallVector<int> Mask(VF.getKnownMinValue()); 7183 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 7184 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 7185 cast<VectorType>(VectorTy), Mask, CostKind, 7186 VF.getKnownMinValue() - 1); 7187 } 7188 7189 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7190 // converted into select instructions. We require N - 1 selects per phi 7191 // node, where N is the number of incoming values. 7192 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7193 return (Phi->getNumIncomingValues() - 1) * 7194 TTI.getCmpSelInstrCost( 7195 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7196 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7197 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7198 7199 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7200 } 7201 case Instruction::UDiv: 7202 case Instruction::SDiv: 7203 case Instruction::URem: 7204 case Instruction::SRem: 7205 if (VF.isVector() && isPredicatedInst(I)) { 7206 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 7207 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 7208 ScalarCost : SafeDivisorCost; 7209 } 7210 // We've proven all lanes safe to speculate, fall through. 
7211 [[fallthrough]]; 7212 case Instruction::Add: 7213 case Instruction::FAdd: 7214 case Instruction::Sub: 7215 case Instruction::FSub: 7216 case Instruction::Mul: 7217 case Instruction::FMul: 7218 case Instruction::FDiv: 7219 case Instruction::FRem: 7220 case Instruction::Shl: 7221 case Instruction::LShr: 7222 case Instruction::AShr: 7223 case Instruction::And: 7224 case Instruction::Or: 7225 case Instruction::Xor: { 7226 // If we're speculating on the stride being 1, the multiplication may 7227 // fold away. We can generalize this for all operations using the notion 7228 // of neutral elements. (TODO) 7229 if (I->getOpcode() == Instruction::Mul && 7230 (PSE.getSCEV(I->getOperand(0))->isOne() || 7231 PSE.getSCEV(I->getOperand(1))->isOne())) 7232 return 0; 7233 7234 // Detect reduction patterns 7235 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7236 return *RedCost; 7237 7238 // Certain instructions can be cheaper to vectorize if they have a constant 7239 // second vector operand. One example of this are shifts on x86. 7240 Value *Op2 = I->getOperand(1); 7241 auto Op2Info = TTI.getOperandInfo(Op2); 7242 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 7243 Legal->isInvariant(Op2)) 7244 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 7245 7246 SmallVector<const Value *, 4> Operands(I->operand_values()); 7247 return TTI.getArithmeticInstrCost( 7248 I->getOpcode(), VectorTy, CostKind, 7249 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7250 Op2Info, Operands, I); 7251 } 7252 case Instruction::FNeg: { 7253 return TTI.getArithmeticInstrCost( 7254 I->getOpcode(), VectorTy, CostKind, 7255 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7256 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7257 I->getOperand(0), I); 7258 } 7259 case Instruction::Select: { 7260 SelectInst *SI = cast<SelectInst>(I); 7261 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7262 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7263 7264 const Value *Op0, *Op1; 7265 using namespace llvm::PatternMatch; 7266 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7267 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7268 // select x, y, false --> x & y 7269 // select x, true, y --> x | y 7270 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 7271 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 7272 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7273 Op1->getType()->getScalarSizeInBits() == 1); 7274 7275 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7276 return TTI.getArithmeticInstrCost( 7277 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7278 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 7279 } 7280 7281 Type *CondTy = SI->getCondition()->getType(); 7282 if (!ScalarCond) 7283 CondTy = VectorType::get(CondTy, VF); 7284 7285 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7286 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7287 Pred = Cmp->getPredicate(); 7288 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7289 CostKind, I); 7290 } 7291 case Instruction::ICmp: 7292 case Instruction::FCmp: { 7293 Type *ValTy = I->getOperand(0)->getType(); 7294 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7295 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7296 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7297 VectorTy = ToVectorTy(ValTy, VF); 7298 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7299 cast<CmpInst>(I)->getPredicate(), CostKind, 7300 I); 7301 } 7302 case Instruction::Store: 7303 case Instruction::Load: { 7304 ElementCount Width = VF; 7305 if (Width.isVector()) { 7306 InstWidening Decision = getWideningDecision(I, Width); 7307 assert(Decision != CM_Unknown && 7308 "CM decision should be taken at this point"); 7309 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7310 return InstructionCost::getInvalid(); 7311 if (Decision == CM_Scalarize) 7312 Width = ElementCount::getFixed(1); 7313 } 7314 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7315 return getMemoryInstructionCost(I, VF); 7316 } 7317 case Instruction::BitCast: 7318 if (I->getType()->isPointerTy()) 7319 return 0; 7320 [[fallthrough]]; 7321 case Instruction::ZExt: 7322 case Instruction::SExt: 7323 case Instruction::FPToUI: 7324 case Instruction::FPToSI: 7325 case Instruction::FPExt: 7326 case Instruction::PtrToInt: 7327 case Instruction::IntToPtr: 7328 case Instruction::SIToFP: 7329 case Instruction::UIToFP: 7330 case Instruction::Trunc: 7331 case Instruction::FPTrunc: { 7332 // Computes the CastContextHint from a Load/Store instruction. 7333 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7334 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7335 "Expected a load or a store!"); 7336 7337 if (VF.isScalar() || !TheLoop->contains(I)) 7338 return TTI::CastContextHint::Normal; 7339 7340 switch (getWideningDecision(I, VF)) { 7341 case LoopVectorizationCostModel::CM_GatherScatter: 7342 return TTI::CastContextHint::GatherScatter; 7343 case LoopVectorizationCostModel::CM_Interleave: 7344 return TTI::CastContextHint::Interleave; 7345 case LoopVectorizationCostModel::CM_Scalarize: 7346 case LoopVectorizationCostModel::CM_Widen: 7347 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7348 : TTI::CastContextHint::Normal; 7349 case LoopVectorizationCostModel::CM_Widen_Reverse: 7350 return TTI::CastContextHint::Reversed; 7351 case LoopVectorizationCostModel::CM_Unknown: 7352 llvm_unreachable("Instr did not go through cost modelling?"); 7353 } 7354 7355 llvm_unreachable("Unhandled case!"); 7356 }; 7357 7358 unsigned Opcode = I->getOpcode(); 7359 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7360 // For Trunc, the context is the only user, which must be a StoreInst. 7361 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7362 if (I->hasOneUse()) 7363 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7364 CCH = ComputeCCH(Store); 7365 } 7366 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7367 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7368 Opcode == Instruction::FPExt) { 7369 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7370 CCH = ComputeCCH(Load); 7371 } 7372 7373 // We optimize the truncation of induction variables having constant 7374 // integer steps. The cost of these truncations is the same as the scalar 7375 // operation. 7376 if (isOptimizableIVTruncate(I, VF)) { 7377 auto *Trunc = cast<TruncInst>(I); 7378 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7379 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7380 } 7381 7382 // Detect reduction patterns 7383 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7384 return *RedCost; 7385 7386 Type *SrcScalarTy = I->getOperand(0)->getType(); 7387 Type *SrcVecTy = 7388 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7389 if (canTruncateToMinimalBitwidth(I, VF)) { 7390 // This cast is going to be shrunk. This may remove the cast or it might 7391 // turn it into slightly different cast. For example, if MinBW == 16, 7392 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7393 // 7394 // Calculate the modified src and dest types. 7395 Type *MinVecTy = VectorTy; 7396 if (Opcode == Instruction::Trunc) { 7397 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7398 VectorTy = 7399 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7400 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7401 // Leave SrcVecTy unchanged - we only shrink the destination element 7402 // type. 7403 VectorTy = 7404 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7405 } 7406 } 7407 7408 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7409 } 7410 case Instruction::Call: { 7411 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7412 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7413 return *RedCost; 7414 Function *Variant; 7415 CallInst *CI = cast<CallInst>(I); 7416 InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); 7417 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7418 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7419 return std::min(CallCost, IntrinsicCost); 7420 } 7421 return CallCost; 7422 } 7423 case Instruction::ExtractValue: 7424 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7425 case Instruction::Alloca: 7426 // We cannot easily widen alloca to a scalable alloca, as 7427 // the result would need to be a vector of pointers. 7428 if (VF.isScalable()) 7429 return InstructionCost::getInvalid(); 7430 [[fallthrough]]; 7431 default: 7432 // This opcode is unknown. Assume that it is the same as 'mul'. 7433 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7434 } // end of switch. 7435 } 7436 7437 void LoopVectorizationCostModel::collectValuesToIgnore() { 7438 // Ignore ephemeral values. 7439 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7440 7441 // Find all stores to invariant variables. Since they are going to sink 7442 // outside the loop we do not need calculate cost for them. 7443 for (BasicBlock *BB : TheLoop->blocks()) 7444 for (Instruction &I : *BB) { 7445 StoreInst *SI; 7446 if ((SI = dyn_cast<StoreInst>(&I)) && 7447 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7448 ValuesToIgnore.insert(&I); 7449 } 7450 7451 // Ignore type-promoting instructions we identified during reduction 7452 // detection. 
7453 for (const auto &Reduction : Legal->getReductionVars()) { 7454 const RecurrenceDescriptor &RedDes = Reduction.second; 7455 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7456 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7457 } 7458 // Ignore type-casting instructions we identified during induction 7459 // detection. 7460 for (const auto &Induction : Legal->getInductionVars()) { 7461 const InductionDescriptor &IndDes = Induction.second; 7462 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7463 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7464 } 7465 } 7466 7467 void LoopVectorizationCostModel::collectInLoopReductions() { 7468 for (const auto &Reduction : Legal->getReductionVars()) { 7469 PHINode *Phi = Reduction.first; 7470 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7471 7472 // We don't collect reductions that are type promoted (yet). 7473 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7474 continue; 7475 7476 // If the target would prefer this reduction to happen "in-loop", then we 7477 // want to record it as such. 7478 unsigned Opcode = RdxDesc.getOpcode(); 7479 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7480 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7481 TargetTransformInfo::ReductionFlags())) 7482 continue; 7483 7484 // Check that we can correctly put the reductions into the loop, by 7485 // finding the chain of operations that leads from the phi to the loop 7486 // exit value. 7487 SmallVector<Instruction *, 4> ReductionOperations = 7488 RdxDesc.getReductionOpChain(Phi, TheLoop); 7489 bool InLoop = !ReductionOperations.empty(); 7490 if (InLoop) { 7491 InLoopReductionChains[Phi] = ReductionOperations; 7492 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7493 Instruction *LastChain = Phi; 7494 for (auto *I : ReductionOperations) { 7495 InLoopReductionImmediateChains[I] = LastChain; 7496 LastChain = I; 7497 } 7498 } 7499 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7500 << " reduction for phi: " << *Phi << "\n"); 7501 } 7502 } 7503 7504 // TODO: we could return a pair of values that specify the max VF and 7505 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7506 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7507 // doesn't have a cost model that can choose which plan to execute if 7508 // more than one is generated. 7509 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7510 LoopVectorizationCostModel &CM) { 7511 unsigned WidestType; 7512 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7513 return WidestVectorRegBits / WidestType; 7514 } 7515 7516 VectorizationFactor 7517 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7518 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7519 ElementCount VF = UserVF; 7520 // Outer loop handling: They may require CFG and instruction level 7521 // transformations before even evaluating whether vectorization is profitable. 7522 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7523 // the vectorization pipeline. 7524 if (!OrigLoop->isInnermost()) { 7525 // If the user doesn't provide a vectorization factor, determine a 7526 // reasonable one. 
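    // (determineVPlanVF above simply divides the widest fixed-width vector
    // register size by the widest scalar type used in the loop.)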
7527 if (UserVF.isZero()) { 7528 VF = ElementCount::getFixed(determineVPlanVF( 7529 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7530 .getFixedValue(), 7531 CM)); 7532 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7533 7534 // Make sure we have a VF > 1 for stress testing. 7535 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7536 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7537 << "overriding computed VF.\n"); 7538 VF = ElementCount::getFixed(4); 7539 } 7540 } 7541 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7542 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7543 "VF needs to be a power of two"); 7544 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7545 << "VF " << VF << " to build VPlans.\n"); 7546 buildVPlans(VF, VF); 7547 7548 // For VPlan build stress testing, we bail out after VPlan construction. 7549 if (VPlanBuildStressTest) 7550 return VectorizationFactor::Disabled(); 7551 7552 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7553 } 7554 7555 LLVM_DEBUG( 7556 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7557 "VPlan-native path.\n"); 7558 return VectorizationFactor::Disabled(); 7559 } 7560 7561 std::optional<VectorizationFactor> 7562 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7563 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7564 CM.collectValuesToIgnore(); 7565 CM.collectElementTypesForWidening(); 7566 7567 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7568 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7569 return std::nullopt; 7570 7571 // Invalidate interleave groups if all blocks of loop will be predicated. 7572 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7573 !useMaskedInterleavedAccesses(TTI)) { 7574 LLVM_DEBUG( 7575 dbgs() 7576 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7577 "which requires masked-interleaved support.\n"); 7578 if (CM.InterleaveInfo.invalidateGroups()) 7579 // Invalidating interleave groups also requires invalidating all decisions 7580 // based on them, which includes widening decisions and uniform and scalar 7581 // values. 7582 CM.invalidateCostModelingDecisions(); 7583 } 7584 7585 ElementCount MaxUserVF = 7586 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7587 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7588 if (!UserVF.isZero() && UserVFIsLegal) { 7589 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7590 "VF needs to be a power of two"); 7591 // Collect the instructions (and their associated costs) that will be more 7592 // profitable to scalarize. 7593 if (CM.selectUserVectorizationFactor(UserVF)) { 7594 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7595 CM.collectInLoopReductions(); 7596 buildVPlansWithVPRecipes(UserVF, UserVF); 7597 if (!hasPlanWithVF(UserVF)) { 7598 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF 7599 << ".\n"); 7600 return std::nullopt; 7601 } 7602 7603 LLVM_DEBUG(printPlans(dbgs())); 7604 return {{UserVF, 0, 0}}; 7605 } else 7606 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7607 "InvalidCost", ORE, OrigLoop); 7608 } 7609 7610 // Populate the set of Vectorization Factor Candidates. 
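  // Candidates are the powers of two from the scalar VF of 1 up to the
  // maximum fixed-width VF, plus the scalable powers of two up to the maximum
  // scalable VF computed above.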
7611 ElementCountSet VFCandidates; 7612 for (auto VF = ElementCount::getFixed(1); 7613 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7614 VFCandidates.insert(VF); 7615 for (auto VF = ElementCount::getScalable(1); 7616 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7617 VFCandidates.insert(VF); 7618 7619 for (const auto &VF : VFCandidates) { 7620 // Collect Uniform and Scalar instructions after vectorization with VF. 7621 CM.collectUniformsAndScalars(VF); 7622 7623 // Collect the instructions (and their associated costs) that will be more 7624 // profitable to scalarize. 7625 if (VF.isVector()) 7626 CM.collectInstsToScalarize(VF); 7627 } 7628 7629 CM.collectInLoopReductions(); 7630 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7631 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7632 7633 LLVM_DEBUG(printPlans(dbgs())); 7634 if (!MaxFactors.hasVector()) 7635 return VectorizationFactor::Disabled(); 7636 7637 // Select the optimal vectorization factor. 7638 VectorizationFactor VF = selectVectorizationFactor(VFCandidates); 7639 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7640 if (!hasPlanWithVF(VF.Width)) { 7641 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width 7642 << ".\n"); 7643 return std::nullopt; 7644 } 7645 return VF; 7646 } 7647 7648 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7649 assert(count_if(VPlans, 7650 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7651 1 && 7652 "Best VF has not a single VPlan."); 7653 7654 for (const VPlanPtr &Plan : VPlans) { 7655 if (Plan->hasVF(VF)) 7656 return *Plan.get(); 7657 } 7658 llvm_unreachable("No plan found!"); 7659 } 7660 7661 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7662 SmallVector<Metadata *, 4> MDs; 7663 // Reserve first location for self reference to the LoopID metadata node. 7664 MDs.push_back(nullptr); 7665 bool IsUnrollMetadata = false; 7666 MDNode *LoopID = L->getLoopID(); 7667 if (LoopID) { 7668 // First find existing loop unrolling disable metadata. 7669 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7670 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7671 if (MD) { 7672 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7673 IsUnrollMetadata = 7674 S && S->getString().startswith("llvm.loop.unroll.disable"); 7675 } 7676 MDs.push_back(LoopID->getOperand(i)); 7677 } 7678 } 7679 7680 if (!IsUnrollMetadata) { 7681 // Add runtime unroll disable metadata. 7682 LLVMContext &Context = L->getHeader()->getContext(); 7683 SmallVector<Metadata *, 1> DisableOperands; 7684 DisableOperands.push_back( 7685 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7686 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7687 MDs.push_back(DisableNode); 7688 MDNode *NewLoopID = MDNode::get(Context, MDs); 7689 // Set operand 0 to refer to the loop id itself. 
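    // (Loop ID metadata is self-referential by convention: the first operand
    // points back at the node itself, which keeps the node distinct.)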
7690 NewLoopID->replaceOperandWith(0, NewLoopID); 7691 L->setLoopID(NewLoopID); 7692 } 7693 } 7694 7695 SCEV2ValueTy LoopVectorizationPlanner::executePlan( 7696 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7697 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, 7698 DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7699 assert(BestVPlan.hasVF(BestVF) && 7700 "Trying to execute plan with unsupported VF"); 7701 assert(BestVPlan.hasUF(BestUF) && 7702 "Trying to execute plan with unsupported UF"); 7703 assert( 7704 (IsEpilogueVectorization || !ExpandedSCEVs) && 7705 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7706 7707 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7708 << '\n'); 7709 7710 if (!IsEpilogueVectorization) 7711 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7712 7713 // Perform the actual loop transformation. 7714 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7715 7716 // 0. Generate SCEV-dependent code into the preheader, including TripCount, 7717 // before making any changes to the CFG. 7718 if (!BestVPlan.getPreheader()->empty()) { 7719 State.CFG.PrevBB = OrigLoop->getLoopPreheader(); 7720 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); 7721 BestVPlan.getPreheader()->execute(&State); 7722 } 7723 if (!ILV.getTripCount()) 7724 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); 7725 else 7726 assert(IsEpilogueVectorization && "should only re-use the existing trip " 7727 "count during epilogue vectorization"); 7728 7729 // 1. Set up the skeleton for vectorization, including vector pre-header and 7730 // middle block. The vector loop is created during VPlan execution. 7731 Value *CanonicalIVStartValue; 7732 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7733 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs 7734 : State.ExpandedSCEVs); 7735 7736 // Only use noalias metadata when using memory checks guaranteeing no overlap 7737 // across all iterations. 7738 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7739 std::unique_ptr<LoopVersioning> LVer = nullptr; 7740 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7741 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7742 7743 // We currently don't use LoopVersioning for the actual loop cloning but we 7744 // still use it to add the noalias metadata. 7745 // TODO: Find a better way to re-use LoopVersioning functionality to add 7746 // metadata. 7747 LVer = std::make_unique<LoopVersioning>( 7748 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7749 PSE.getSE()); 7750 State.LVer = &*LVer; 7751 State.LVer->prepareNoAliasMetadata(); 7752 } 7753 7754 ILV.collectPoisonGeneratingRecipes(State); 7755 7756 ILV.printDebugTracesAtStart(); 7757 7758 //===------------------------------------------------===// 7759 // 7760 // Notice: any optimization or new instruction that go 7761 // into the code below should also be implemented in 7762 // the cost-model. 7763 // 7764 //===------------------------------------------------===// 7765 7766 // 2. Copy and widen instructions from the old loop into the new loop. 
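  // Roughly: prepareToExecute() binds the trip counts and the canonical IV
  // start value the plan will use, and execute() then emits the vector loop
  // body from the plan's recipes.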
7767 BestVPlan.prepareToExecute( 7768 ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), 7769 CanonicalIVStartValue, State, IsEpilogueVectorization); 7770 7771 BestVPlan.execute(&State); 7772 7773 // Keep all loop hints from the original loop on the vector loop (we'll 7774 // replace the vectorizer-specific hints below). 7775 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7776 7777 std::optional<MDNode *> VectorizedLoopID = 7778 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7779 LLVMLoopVectorizeFollowupVectorized}); 7780 7781 VPBasicBlock *HeaderVPBB = 7782 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7783 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7784 if (VectorizedLoopID) 7785 L->setLoopID(*VectorizedLoopID); 7786 else { 7787 // Keep all loop hints from the original loop on the vector loop (we'll 7788 // replace the vectorizer-specific hints below). 7789 if (MDNode *LID = OrigLoop->getLoopID()) 7790 L->setLoopID(LID); 7791 7792 LoopVectorizeHints Hints(L, true, *ORE); 7793 Hints.setAlreadyVectorized(); 7794 } 7795 TargetTransformInfo::UnrollingPreferences UP; 7796 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7797 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) 7798 AddRuntimeUnrollDisableMetaData(L); 7799 7800 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7801 // predication, updating analyses. 7802 ILV.fixVectorizedLoop(State, BestVPlan); 7803 7804 ILV.printDebugTracesAtEnd(); 7805 7806 return State.ExpandedSCEVs; 7807 } 7808 7809 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7810 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7811 for (const auto &Plan : VPlans) 7812 if (PrintVPlansInDotFormat) 7813 Plan->printDOT(O); 7814 else 7815 Plan->print(O); 7816 } 7817 #endif 7818 7819 //===--------------------------------------------------------------------===// 7820 // EpilogueVectorizerMainLoop 7821 //===--------------------------------------------------------------------===// 7822 7823 /// This function is partially responsible for generating the control flow 7824 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7825 std::pair<BasicBlock *, Value *> 7826 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7827 const SCEV2ValueTy &ExpandedSCEVs) { 7828 createVectorLoopSkeleton(""); 7829 7830 // Generate the code to check the minimum iteration count of the vector 7831 // epilogue (see below). 7832 EPI.EpilogueIterationCountCheck = 7833 emitIterationCountCheck(LoopScalarPreHeader, true); 7834 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7835 7836 // Generate the code to check any assumptions that we've made for SCEV 7837 // expressions. 7838 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7839 7840 // Generate the code that checks at runtime if arrays overlap. We put the 7841 // checks into a separate block to make the more common case of few elements 7842 // faster. 7843 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7844 7845 // Generate the iteration count check for the main loop, *after* the check 7846 // for the epilogue loop, so that the path-length is shorter for the case 7847 // that goes directly through the vector epilogue. The longer-path length for 7848 // the main loop is compensated for, by the gain from vectorizing the larger 7849 // trip count. Note: the branch will get updated later on when we vectorize 7850 // the epilogue. 
7851 EPI.MainLoopIterationCountCheck = 7852 emitIterationCountCheck(LoopScalarPreHeader, false); 7853 7854 // Generate the induction variable. 7855 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7856 7857 // Skip induction resume value creation here because they will be created in 7858 // the second pass for the scalar loop. The induction resume values for the 7859 // inductions in the epilogue loop are created before executing the plan for 7860 // the epilogue loop. 7861 7862 return {completeLoopSkeleton(), nullptr}; 7863 } 7864 7865 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7866 LLVM_DEBUG({ 7867 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7868 << "Main Loop VF:" << EPI.MainLoopVF 7869 << ", Main Loop UF:" << EPI.MainLoopUF 7870 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7871 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7872 }); 7873 } 7874 7875 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7876 DEBUG_WITH_TYPE(VerboseDebug, { 7877 dbgs() << "intermediate fn:\n" 7878 << *OrigLoop->getHeader()->getParent() << "\n"; 7879 }); 7880 } 7881 7882 BasicBlock * 7883 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7884 bool ForEpilogue) { 7885 assert(Bypass && "Expected valid bypass basic block."); 7886 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7887 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7888 Value *Count = getTripCount(); 7889 // Reuse existing vector loop preheader for TC checks. 7890 // Note that new preheader block is generated for vector loop. 7891 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7892 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7893 7894 // Generate code to check if the loop's trip count is less than VF * UF of the 7895 // main vector loop. 7896 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7897 : VF.isVector()) 7898 ? ICmpInst::ICMP_ULE 7899 : ICmpInst::ICMP_ULT; 7900 7901 Value *CheckMinIters = Builder.CreateICmp( 7902 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7903 "min.iters.check"); 7904 7905 if (!ForEpilogue) 7906 TCCheckBlock->setName("vector.main.loop.iter.check"); 7907 7908 // Create new preheader for vector loop. 7909 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7910 DT, LI, nullptr, "vector.ph"); 7911 7912 if (ForEpilogue) { 7913 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7914 DT->getNode(Bypass)->getIDom()) && 7915 "TC check is expected to dominate Bypass"); 7916 7917 // Update dominator for Bypass & LoopExit. 7918 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7919 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7920 // For loops with multiple exits, there's no edge from the middle block 7921 // to exit blocks (as the epilogue must run) and thus no need to update 7922 // the immediate dominator of the exit blocks. 7923 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7924 7925 LoopBypassBlocks.push_back(TCCheckBlock); 7926 7927 // Save the trip count so we don't have to regenerate it in the 7928 // vec.epilog.iter.check. This is safe to do because the trip count 7929 // generated here dominates the vector epilog iter check. 
7930 EPI.TripCount = Count; 7931 } 7932 7933 ReplaceInstWithInst( 7934 TCCheckBlock->getTerminator(), 7935 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7936 7937 return TCCheckBlock; 7938 } 7939 7940 //===--------------------------------------------------------------------===// 7941 // EpilogueVectorizerEpilogueLoop 7942 //===--------------------------------------------------------------------===// 7943 7944 /// This function is partially responsible for generating the control flow 7945 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7946 std::pair<BasicBlock *, Value *> 7947 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7948 const SCEV2ValueTy &ExpandedSCEVs) { 7949 createVectorLoopSkeleton("vec.epilog."); 7950 7951 // Now, compare the remaining count and if there aren't enough iterations to 7952 // execute the vectorized epilogue skip to the scalar part. 7953 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7954 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7955 LoopVectorPreHeader = 7956 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7957 LI, nullptr, "vec.epilog.ph"); 7958 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7959 VecEpilogueIterationCountCheck); 7960 7961 // Adjust the control flow taking the state info from the main loop 7962 // vectorization into account. 7963 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7964 "expected this to be saved from the previous pass."); 7965 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7966 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7967 7968 DT->changeImmediateDominator(LoopVectorPreHeader, 7969 EPI.MainLoopIterationCountCheck); 7970 7971 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7972 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7973 7974 if (EPI.SCEVSafetyCheck) 7975 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7976 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7977 if (EPI.MemSafetyCheck) 7978 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7979 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7980 7981 DT->changeImmediateDominator( 7982 VecEpilogueIterationCountCheck, 7983 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7984 7985 DT->changeImmediateDominator(LoopScalarPreHeader, 7986 EPI.EpilogueIterationCountCheck); 7987 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7988 // If there is an epilogue which must run, there's no edge from the 7989 // middle block to exit blocks and thus no need to update the immediate 7990 // dominator of the exit blocks. 7991 DT->changeImmediateDominator(LoopExitBlock, 7992 EPI.EpilogueIterationCountCheck); 7993 7994 // Keep track of bypass blocks, as they feed start values to the induction and 7995 // reduction phis in the scalar loop preheader. 7996 if (EPI.SCEVSafetyCheck) 7997 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7998 if (EPI.MemSafetyCheck) 7999 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8000 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8001 8002 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 8003 // reductions which merge control-flow from the latch block and the middle 8004 // block. Update the incoming values here and move the Phi into the preheader. 
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(ExpandedSCEVs,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
               ?
ICmpInst::ICMP_ULE 8071 : ICmpInst::ICMP_ULT; 8072 8073 Value *CheckMinIters = 8074 Builder.CreateICmp(P, Count, 8075 createStepForVF(Builder, Count->getType(), 8076 EPI.EpilogueVF, EPI.EpilogueUF), 8077 "min.epilog.iters.check"); 8078 8079 ReplaceInstWithInst( 8080 Insert->getTerminator(), 8081 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8082 8083 LoopBypassBlocks.push_back(Insert); 8084 return Insert; 8085 } 8086 8087 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8088 LLVM_DEBUG({ 8089 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8090 << "Epilogue Loop VF:" << EPI.EpilogueVF 8091 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8092 }); 8093 } 8094 8095 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8096 DEBUG_WITH_TYPE(VerboseDebug, { 8097 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8098 }); 8099 } 8100 8101 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8102 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8103 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8104 bool PredicateAtRangeStart = Predicate(Range.Start); 8105 8106 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) 8107 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8108 Range.End = TmpVF; 8109 break; 8110 } 8111 8112 return PredicateAtRangeStart; 8113 } 8114 8115 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8116 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8117 /// of VF's starting at a given VF and extending it as much as possible. Each 8118 /// vectorization decision can potentially shorten this sub-range during 8119 /// buildVPlan(). 8120 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8121 ElementCount MaxVF) { 8122 auto MaxVFTimes2 = MaxVF * 2; 8123 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8124 VFRange SubRange = {VF, MaxVFTimes2}; 8125 VPlans.push_back(buildVPlan(SubRange)); 8126 VF = SubRange.End; 8127 } 8128 } 8129 8130 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8131 VPlan &Plan) { 8132 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8133 8134 // Look for cached value. 8135 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8136 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8137 if (ECEntryIt != EdgeMaskCache.end()) 8138 return ECEntryIt->second; 8139 8140 VPValue *SrcMask = createBlockInMask(Src, Plan); 8141 8142 // The terminator has to be a branch inst! 8143 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8144 assert(BI && "Unexpected terminator found"); 8145 8146 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8147 return EdgeMaskCache[Edge] = SrcMask; 8148 8149 // If source is an exiting block, we know the exit edge is dynamically dead 8150 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8151 // adding uses of an otherwise potentially dead instruction. 8152 if (OrigLoop->isLoopExiting(Src)) 8153 return EdgeMaskCache[Edge] = SrcMask; 8154 8155 VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); 8156 assert(EdgeMask && "No Edge Mask found for condition"); 8157 8158 if (BI->getSuccessor(0) != Dst) 8159 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8160 8161 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8162 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8163 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8164 // The select version does not introduce new UB if SrcMask is false and 8165 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8166 VPValue *False = Plan.getVPValueOrAddLiveIn( 8167 ConstantInt::getFalse(BI->getCondition()->getType())); 8168 EdgeMask = 8169 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8170 } 8171 8172 return EdgeMaskCache[Edge] = EdgeMask; 8173 } 8174 8175 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { 8176 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8177 8178 // Look for cached value. 8179 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8180 if (BCEntryIt != BlockMaskCache.end()) 8181 return BCEntryIt->second; 8182 8183 // All-one mask is modelled as no-mask following the convention for masked 8184 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8185 VPValue *BlockMask = nullptr; 8186 8187 if (OrigLoop->getHeader() == BB) { 8188 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8189 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8190 8191 assert(CM.foldTailByMasking() && "must fold the tail"); 8192 8193 // If we're using the active lane mask for control flow, then we get the 8194 // mask from the active lane mask PHI that is cached in the VPlan. 8195 TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); 8196 if (useActiveLaneMaskForControlFlow(TFStyle)) 8197 return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); 8198 8199 // Introduce the early-exit compare IV <= BTC to form header block mask. 8200 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8201 // constructing the desired canonical IV in the header block as its first 8202 // non-phi instructions. 8203 8204 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8205 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8206 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8207 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8208 8209 VPBuilder::InsertPointGuard Guard(Builder); 8210 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8211 if (useActiveLaneMask(TFStyle)) { 8212 VPValue *TC = Plan.getTripCount(); 8213 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, 8214 nullptr, "active.lane.mask"); 8215 } else { 8216 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8217 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8218 } 8219 return BlockMaskCache[BB] = BlockMask; 8220 } 8221 8222 // This is the block mask. We OR all incoming edges. 8223 for (auto *Predecessor : predecessors(BB)) { 8224 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8225 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8226 return BlockMaskCache[BB] = EdgeMask; 8227 8228 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8229 BlockMask = EdgeMask;
8230 continue;
8231 }
8232
8233 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8234 }
8235
8236 return BlockMaskCache[BB] = BlockMask;
8237 }
8238
8239 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8240 ArrayRef<VPValue *> Operands,
8241 VFRange &Range,
8242 VPlanPtr &Plan) {
8243 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8244 "Must be called with either a load or store");
8245
8246 auto willWiden = [&](ElementCount VF) -> bool {
8247 LoopVectorizationCostModel::InstWidening Decision =
8248 CM.getWideningDecision(I, VF);
8249 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8250 "CM decision should be taken at this point.");
8251 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8252 return true;
8253 if (CM.isScalarAfterVectorization(I, VF) ||
8254 CM.isProfitableToScalarize(I, VF))
8255 return false;
8256 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8257 };
8258
8259 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8260 return nullptr;
8261
8262 VPValue *Mask = nullptr;
8263 if (Legal->isMaskRequired(I))
8264 Mask = createBlockInMask(I->getParent(), *Plan);
8265
8266 // Determine if the pointer operand of the access is either consecutive or
8267 // reverse consecutive.
8268 LoopVectorizationCostModel::InstWidening Decision =
8269 CM.getWideningDecision(I, Range.Start);
8270 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8271 bool Consecutive =
8272 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8273
8274 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8275 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8276 Consecutive, Reverse);
8277
8278 StoreInst *Store = cast<StoreInst>(I);
8279 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8280 Mask, Consecutive, Reverse);
8281 }
8282
8283 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8284 /// insert a recipe to expand the step for the induction recipe.
8285 static VPWidenIntOrFpInductionRecipe *
8286 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8287 VPValue *Start, const InductionDescriptor &IndDesc,
8288 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8289 VFRange &Range) {
8290 assert(IndDesc.getStartValue() ==
8291 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8292 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8293 "step must be loop invariant");
8294
8295 VPValue *Step =
8296 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8297 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8298 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8299 }
8300 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8301 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8302 }
8303
8304 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8305 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8306
8307 // Check if this is an integer or fp induction. If so, build the recipe that
8308 // produces its scalar and vector values.
8309 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8310 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8311 *PSE.getSE(), *OrigLoop, Range);
8312
8313 // Check if this is pointer induction. If so, build the recipe for it.
8314 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8315 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8316 *PSE.getSE()); 8317 return new VPWidenPointerInductionRecipe( 8318 Phi, Operands[0], Step, *II, 8319 LoopVectorizationPlanner::getDecisionAndClampRange( 8320 [&](ElementCount VF) { 8321 return CM.isScalarAfterVectorization(Phi, VF); 8322 }, 8323 Range)); 8324 } 8325 return nullptr; 8326 } 8327 8328 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8329 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8330 // Optimize the special case where the source is a constant integer 8331 // induction variable. Notice that we can only optimize the 'trunc' case 8332 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8333 // (c) other casts depend on pointer size. 8334 8335 // Determine whether \p K is a truncation based on an induction variable that 8336 // can be optimized. 8337 auto isOptimizableIVTruncate = 8338 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8339 return [=](ElementCount VF) -> bool { 8340 return CM.isOptimizableIVTruncate(K, VF); 8341 }; 8342 }; 8343 8344 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8345 isOptimizableIVTruncate(I), Range)) { 8346 8347 auto *Phi = cast<PHINode>(I->getOperand(0)); 8348 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8349 VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); 8350 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8351 *OrigLoop, Range); 8352 } 8353 return nullptr; 8354 } 8355 8356 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8357 ArrayRef<VPValue *> Operands, 8358 VPlanPtr &Plan) { 8359 // If all incoming values are equal, the incoming VPValue can be used directly 8360 // instead of creating a new VPBlendRecipe. 8361 if (llvm::all_equal(Operands)) 8362 return Operands[0]; 8363 8364 unsigned NumIncoming = Phi->getNumIncomingValues(); 8365 // For in-loop reductions, we do not need to create an additional select. 8366 VPValue *InLoopVal = nullptr; 8367 for (unsigned In = 0; In < NumIncoming; In++) { 8368 PHINode *PhiOp = 8369 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8370 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8371 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8372 InLoopVal = Operands[In]; 8373 } 8374 } 8375 8376 assert((!InLoopVal || NumIncoming == 2) && 8377 "Found an in-loop reduction for PHI with unexpected number of " 8378 "incoming values"); 8379 if (InLoopVal) 8380 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8381 8382 // We know that all PHIs in non-header blocks are converted into selects, so 8383 // we don't have to worry about the insertion order and we can just use the 8384 // builder. At this point we generate the predication tree. There may be 8385 // duplications since this is a simple recursive scan, but future 8386 // optimizations will clean it up. 
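  // Illustrative sketch only (block and value names invented): a phi in a
  // flattened, predicated region such as
  //   %r = phi i32 [ %a, %if.then ], [ %b, %if.else ]
  // is modeled as a VPBlendRecipe pairing each incoming value with the mask
  // of its incoming edge, conceptually
  //   BLEND %r = %a/edge-mask(if.then) %b/edge-mask(if.else)
  // which the recipe later lowers to vector selects over those edge masks.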
8387 SmallVector<VPValue *, 2> OperandsWithMask;
8388
8389 for (unsigned In = 0; In < NumIncoming; In++) {
8390 VPValue *EdgeMask =
8391 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8392 assert((EdgeMask || NumIncoming == 1) &&
8393 "Multiple predecessors with one having a full mask");
8394 OperandsWithMask.push_back(Operands[In]);
8395 if (EdgeMask)
8396 OperandsWithMask.push_back(EdgeMask);
8397 }
8398 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8399 }
8400
8401 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8402 ArrayRef<VPValue *> Operands,
8403 VFRange &Range,
8404 VPlanPtr &Plan) {
8405 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8406 [this, CI](ElementCount VF) {
8407 return CM.isScalarWithPredication(CI, VF);
8408 },
8409 Range);
8410
8411 if (IsPredicated)
8412 return nullptr;
8413
8414 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8415 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8416 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8417 ID == Intrinsic::pseudoprobe ||
8418 ID == Intrinsic::experimental_noalias_scope_decl))
8419 return nullptr;
8420
8421 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8422
8423 // Is it beneficial to perform intrinsic call compared to lib call?
8424 bool ShouldUseVectorIntrinsic =
8425 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8426 [&](ElementCount VF) -> bool {
8427 Function *Variant;
8428 // Is it beneficial to perform intrinsic call compared to lib
8429 // call?
8430 InstructionCost CallCost =
8431 CM.getVectorCallCost(CI, VF, &Variant);
8432 InstructionCost IntrinsicCost =
8433 CM.getVectorIntrinsicCost(CI, VF);
8434 return IntrinsicCost <= CallCost;
8435 },
8436 Range);
8437 if (ShouldUseVectorIntrinsic)
8438 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8439
8440 Function *Variant = nullptr;
8441 ElementCount VariantVF;
8442 bool NeedsMask = false;
8443 // Is it better to call a vectorized version of the function than to scalarize
8444 // the call?
8445 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8446 [&](ElementCount VF) -> bool {
8447 // The following case may be scalarized depending on the VF.
8448 // The flag shows whether we can use a usual call for the vectorized
8449 // version of the instruction.
8450
8451 // If we've found a variant at a previous VF, then stop looking. A
8452 // vectorized variant of a function expects input in a certain shape
8453 // -- basically the number of input registers, the number of lanes
8454 // per register, and whether there's a mask required.
8455 // We store a pointer to the variant in the VPWidenCallRecipe, so
8456 // once we have an appropriate variant it's only valid for that VF.
8457 // This will force a different vplan to be generated for each VF that
8458 // finds a valid variant.
8459 if (Variant)
8460 return false;
8461 CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
8462 // If we found a valid vector variant at this VF, then store the VF
8463 // in case we need to generate a mask.
8464 if (Variant) 8465 VariantVF = VF; 8466 return Variant != nullptr; 8467 }, 8468 Range); 8469 if (ShouldUseVectorCall) { 8470 if (NeedsMask) { 8471 // We have 2 cases that would require a mask: 8472 // 1) The block needs to be predicated, either due to a conditional 8473 // in the scalar loop or use of an active lane mask with 8474 // tail-folding, and we use the appropriate mask for the block. 8475 // 2) No mask is required for the block, but the only available 8476 // vector variant at this VF requires a mask, so we synthesize an 8477 // all-true mask. 8478 VPValue *Mask = nullptr; 8479 if (Legal->isMaskRequired(CI)) 8480 Mask = createBlockInMask(CI->getParent(), *Plan); 8481 else 8482 Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( 8483 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); 8484 8485 VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); 8486 unsigned MaskPos = 0; 8487 8488 for (const VFInfo &Info : VFDatabase::getMappings(*CI)) 8489 if (Info.Shape == Shape) { 8490 assert(Info.isMasked() && "Vector function info shape mismatch"); 8491 MaskPos = Info.getParamIndexForOptionalMask().value(); 8492 break; 8493 } 8494 8495 Ops.insert(Ops.begin() + MaskPos, Mask); 8496 } 8497 8498 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), 8499 Intrinsic::not_intrinsic, Variant); 8500 } 8501 8502 return nullptr; 8503 } 8504 8505 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8506 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8507 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8508 // Instruction should be widened, unless it is scalar after vectorization, 8509 // scalarization is profitable or it is predicated. 8510 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8511 return CM.isScalarAfterVectorization(I, VF) || 8512 CM.isProfitableToScalarize(I, VF) || 8513 CM.isScalarWithPredication(I, VF); 8514 }; 8515 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8516 Range); 8517 } 8518 8519 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, 8520 ArrayRef<VPValue *> Operands, 8521 VPBasicBlock *VPBB, VPlanPtr &Plan) { 8522 switch (I->getOpcode()) { 8523 default: 8524 return nullptr; 8525 case Instruction::SDiv: 8526 case Instruction::UDiv: 8527 case Instruction::SRem: 8528 case Instruction::URem: { 8529 // If not provably safe, use a select to form a safe divisor before widening the 8530 // div/rem operation itself. Otherwise fall through to general handling below. 
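    // For example (sketch; names invented): for a predicated 'udiv %x, %d'
    // with block mask %m, the code below emits
    //   %safe.d = select %m, %d, 1
    // and widens 'udiv %x, %safe.d', so lanes that are masked off divide by
    // 1 instead of a potentially zero (or otherwise trapping) divisor.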
8531 if (CM.isPredicatedInst(I)) {
8532 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8533 VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
8534 VPValue *One = Plan->getVPValueOrAddLiveIn(
8535 ConstantInt::get(I->getType(), 1u, false));
8536 auto *SafeRHS =
8537 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8538 I->getDebugLoc());
8539 VPBB->appendRecipe(SafeRHS);
8540 Ops[1] = SafeRHS;
8541 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8542 }
8543 [[fallthrough]];
8544 }
8545 case Instruction::Add:
8546 case Instruction::And:
8547 case Instruction::AShr:
8548 case Instruction::FAdd:
8549 case Instruction::FCmp:
8550 case Instruction::FDiv:
8551 case Instruction::FMul:
8552 case Instruction::FNeg:
8553 case Instruction::FRem:
8554 case Instruction::FSub:
8555 case Instruction::ICmp:
8556 case Instruction::LShr:
8557 case Instruction::Mul:
8558 case Instruction::Or:
8559 case Instruction::Select:
8560 case Instruction::Shl:
8561 case Instruction::Sub:
8562 case Instruction::Xor:
8563 case Instruction::Freeze:
8564 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8565 };
8566 }
8567
8568 void VPRecipeBuilder::fixHeaderPhis() {
8569 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8570 for (VPHeaderPHIRecipe *R : PhisToFix) {
8571 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8572 VPRecipeBase *IncR =
8573 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8574 R->addOperand(IncR->getVPSingleValue());
8575 }
8576 }
8577
8578 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8579 VFRange &Range,
8580 VPlan &Plan) {
8581 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8582 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8583 Range);
8584
8585 bool IsPredicated = CM.isPredicatedInst(I);
8586
8587 // Even if the instruction is not marked as uniform, there are certain
8588 // intrinsic calls that can be effectively treated as such, so we check for
8589 // them here. Conservatively, we only do this for scalable vectors, since
8590 // for fixed-width VFs we can always fall back on full scalarization.
8591 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8592 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8593 case Intrinsic::assume:
8594 case Intrinsic::lifetime_start:
8595 case Intrinsic::lifetime_end:
8596 // For scalable vectors if one of the operands is variant then we still
8597 // want to mark as uniform, which will generate one instruction for just
8598 // the first lane of the vector. We can't scalarize the call in the same
8599 // way as for fixed-width vectors because we don't know how many lanes
8600 // there are.
8601 //
8602 // The reasons for doing it this way for scalable vectors are:
8603 // 1. For the assume intrinsic generating the instruction for the first
8604 // lane is still better than not generating any at all. For
8605 // example, the input may be a splat across all lanes.
8606 // 2. For the lifetime start/end intrinsics the pointer operand only
8607 // does anything useful when the input comes from a stack object,
8608 // which suggests it should always be uniform. For non-stack objects
8609 // the effect is to poison the object, which still allows us to
8610 // remove the call.
8611 IsUniform = true; 8612 break; 8613 default: 8614 break; 8615 } 8616 } 8617 VPValue *BlockInMask = nullptr; 8618 if (!IsPredicated) { 8619 // Finalize the recipe for Instr, first if it is not predicated. 8620 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8621 } else { 8622 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8623 // Instructions marked for predication are replicated and a mask operand is 8624 // added initially. Masked replicate recipes will later be placed under an 8625 // if-then construct to prevent side-effects. Generate recipes to compute 8626 // the block mask for this region. 8627 BlockInMask = createBlockInMask(I->getParent(), Plan); 8628 } 8629 8630 auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), 8631 IsUniform, BlockInMask); 8632 return toVPRecipeResult(Recipe); 8633 } 8634 8635 VPRecipeOrVPValueTy 8636 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8637 ArrayRef<VPValue *> Operands, 8638 VFRange &Range, VPBasicBlock *VPBB, 8639 VPlanPtr &Plan) { 8640 // First, check for specific widening recipes that deal with inductions, Phi 8641 // nodes, calls and memory operations. 8642 VPRecipeBase *Recipe; 8643 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8644 if (Phi->getParent() != OrigLoop->getHeader()) 8645 return tryToBlend(Phi, Operands, Plan); 8646 8647 // Always record recipes for header phis. Later first-order recurrence phis 8648 // can have earlier phis as incoming values. 8649 recordRecipeOf(Phi); 8650 8651 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8652 return toVPRecipeResult(Recipe); 8653 8654 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8655 assert((Legal->isReductionVariable(Phi) || 8656 Legal->isFixedOrderRecurrence(Phi)) && 8657 "can only widen reductions and fixed-order recurrences here"); 8658 VPValue *StartV = Operands[0]; 8659 if (Legal->isReductionVariable(Phi)) { 8660 const RecurrenceDescriptor &RdxDesc = 8661 Legal->getReductionVars().find(Phi)->second; 8662 assert(RdxDesc.getRecurrenceStartValue() == 8663 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8664 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8665 CM.isInLoopReduction(Phi), 8666 CM.useOrderedReductions(RdxDesc)); 8667 } else { 8668 // TODO: Currently fixed-order recurrences are modeled as chains of 8669 // first-order recurrences. If there are no users of the intermediate 8670 // recurrences in the chain, the fixed order recurrence should be modeled 8671 // directly, enabling more efficient codegen. 8672 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8673 } 8674 8675 // Record the incoming value from the backedge, so we can add the incoming 8676 // value from the backedge after all recipes have been created. 8677 auto *Inc = cast<Instruction>( 8678 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 8679 auto RecipeIter = Ingredient2Recipe.find(Inc); 8680 if (RecipeIter == Ingredient2Recipe.end()) 8681 recordRecipeOf(Inc); 8682 8683 PhisToFix.push_back(PhiRecipe); 8684 return toVPRecipeResult(PhiRecipe); 8685 } 8686 8687 if (isa<TruncInst>(Instr) && 8688 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8689 Range, *Plan))) 8690 return toVPRecipeResult(Recipe); 8691 8692 // All widen recipes below deal only with VF > 1. 
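  // E.g. (illustrative): given Range = {Start=1, End=8}, the predicate
  // 'VF.isScalar()' is true at VF=1 but false at VF=2, so the range is
  // clamped to {1, 2}; this plan then only covers the scalar VF and returns
  // nullptr here, while wider VFs are handled by subsequently built plans.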
8693 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8694 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8695 return nullptr;
8696
8697 if (auto *CI = dyn_cast<CallInst>(Instr))
8698 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
8699
8700 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8701 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8702
8703 if (!shouldWiden(Instr, Range))
8704 return nullptr;
8705
8706 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8707 return toVPRecipeResult(new VPWidenGEPRecipe(
8708 GEP, make_range(Operands.begin(), Operands.end())));
8709
8710 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8711 return toVPRecipeResult(new VPWidenSelectRecipe(
8712 *SI, make_range(Operands.begin(), Operands.end())));
8713 }
8714
8715 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8716 return toVPRecipeResult(
8717 new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
8718 }
8719
8720 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8721 }
8722
8723 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8724 ElementCount MaxVF) {
8725 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8726
8727 // Add assume instructions we need to drop to DeadInstructions, to prevent
8728 // them from being added to the VPlan.
8729 // TODO: We only need to drop assumes in blocks that get flattened. If the
8730 // control flow is preserved, we should keep them.
8731 SmallPtrSet<Instruction *, 4> DeadInstructions;
8732 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8733 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8734
8735 auto MaxVFTimes2 = MaxVF * 2;
8736 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8737 VFRange SubRange = {VF, MaxVFTimes2};
8738 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions))
8739 VPlans.push_back(std::move(*Plan));
8740 VF = SubRange.End;
8741 }
8742 }
8743
8744 // Add the necessary canonical IV and branch recipes required to control the
8745 // loop.
8746 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8747 TailFoldingStyle Style) {
8748 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8749 auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8750
8751 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8752 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8753 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8754 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8755 Header->insert(CanonicalIVPHI, Header->begin());
8756
8757 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8758 // IV by VF * UF.
8759 bool HasNUW = Style == TailFoldingStyle::None;
8760 auto *CanonicalIVIncrement =
8761 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8762 : VPInstruction::CanonicalIVIncrement,
8763 {CanonicalIVPHI}, DL, "index.next");
8764 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8765
8766 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8767 if (useActiveLaneMaskForControlFlow(Style)) {
8768 // Create the active lane mask instruction in the vplan preheader.
8769 VPBasicBlock *VecPreheader =
8770 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
8771
8772 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8773 // we have to take unrolling into account.
Each part needs to start at 8774 // Part * VF 8775 auto *CanonicalIVIncrementParts = 8776 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8777 : VPInstruction::CanonicalIVIncrementForPart, 8778 {StartV}, DL, "index.part.next"); 8779 VecPreheader->appendRecipe(CanonicalIVIncrementParts); 8780 8781 // Create the ActiveLaneMask instruction using the correct start values. 8782 VPValue *TC = Plan.getTripCount(); 8783 8784 VPValue *TripCount, *IncrementValue; 8785 if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 8786 // When avoiding a runtime check, the active.lane.mask inside the loop 8787 // uses a modified trip count and the induction variable increment is 8788 // done after the active.lane.mask intrinsic is called. 8789 auto *TCMinusVF = 8790 new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); 8791 VecPreheader->appendRecipe(TCMinusVF); 8792 IncrementValue = CanonicalIVPHI; 8793 TripCount = TCMinusVF; 8794 } else { 8795 // When the loop is guarded by a runtime overflow check for the loop 8796 // induction variable increment by VF, we can increment the value before 8797 // the get.active.lane mask and use the unmodified tripcount. 8798 EB->appendRecipe(CanonicalIVIncrement); 8799 IncrementValue = CanonicalIVIncrement; 8800 TripCount = TC; 8801 } 8802 8803 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8804 {CanonicalIVIncrementParts, TC}, DL, 8805 "active.lane.mask.entry"); 8806 VecPreheader->appendRecipe(EntryALM); 8807 8808 // Now create the ActiveLaneMaskPhi recipe in the main loop using the 8809 // preheader ActiveLaneMask instruction. 8810 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); 8811 Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); 8812 8813 // Create the active lane mask for the next iteration of the loop. 8814 CanonicalIVIncrementParts = 8815 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8816 : VPInstruction::CanonicalIVIncrementForPart, 8817 {IncrementValue}, DL); 8818 EB->appendRecipe(CanonicalIVIncrementParts); 8819 8820 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8821 {CanonicalIVIncrementParts, TripCount}, DL, 8822 "active.lane.mask.next"); 8823 EB->appendRecipe(ALM); 8824 LaneMaskPhi->addOperand(ALM); 8825 8826 if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 8827 // Do the increment of the canonical IV after the active.lane.mask, because 8828 // that value is still based off %CanonicalIVPHI 8829 EB->appendRecipe(CanonicalIVIncrement); 8830 } 8831 8832 // We have to invert the mask here because a true condition means jumping 8833 // to the exit block. 8834 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); 8835 EB->appendRecipe(NotMask); 8836 8837 VPInstruction *BranchBack = 8838 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); 8839 EB->appendRecipe(BranchBack); 8840 } else { 8841 EB->appendRecipe(CanonicalIVIncrement); 8842 8843 // Add the BranchOnCount VPInstruction to the latch. 8844 VPInstruction *BranchBack = new VPInstruction( 8845 VPInstruction::BranchOnCount, 8846 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8847 EB->appendRecipe(BranchBack); 8848 } 8849 } 8850 8851 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8852 // original exit block. 
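// E.g. (sketch; names invented): an LCSSA phi in the exit block such as
//   %sum.lcssa = phi i32 [ %sum.next, %loop.exiting ]
// is recorded as a VPLiveOut of the VPValue modeling %sum.next, so the value
// computed by the vector loop can later be wired into that exit phi.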
8853 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8854 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8855 VPlan &Plan) { 8856 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8857 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8858 // Only handle single-exit loops with unique exit blocks for now. 8859 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8860 return; 8861 8862 // Introduce VPUsers modeling the exit values. 8863 for (PHINode &ExitPhi : ExitBB->phis()) { 8864 Value *IncomingValue = 8865 ExitPhi.getIncomingValueForBlock(ExitingBB); 8866 VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); 8867 Plan.addLiveOut(&ExitPhi, V); 8868 } 8869 } 8870 8871 std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( 8872 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8873 8874 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8875 8876 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8877 8878 // --------------------------------------------------------------------------- 8879 // Pre-construction: record ingredients whose recipes we'll need to further 8880 // process after constructing the initial VPlan. 8881 // --------------------------------------------------------------------------- 8882 8883 for (const auto &Reduction : CM.getInLoopReductionChains()) { 8884 PHINode *Phi = Reduction.first; 8885 RecurKind Kind = 8886 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8887 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8888 8889 RecipeBuilder.recordRecipeOf(Phi); 8890 for (const auto &R : ReductionOperations) { 8891 RecipeBuilder.recordRecipeOf(R); 8892 // For min/max reductions, where we have a pair of icmp/select, we also 8893 // need to record the ICmp recipe, so it can be removed later. 8894 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8895 "Only min/max recurrences allowed for inloop reductions"); 8896 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8897 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8898 } 8899 } 8900 8901 // For each interleave group which is relevant for this (possibly trimmed) 8902 // Range, add it to the set of groups to be later applied to the VPlan and add 8903 // placeholders for its members' Recipes which we'll be replacing with a 8904 // single VPInterleaveRecipe. 8905 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8906 auto applyIG = [IG, this](ElementCount VF) -> bool { 8907 bool Result = (VF.isVector() && // Query is illegal for VF == 1 8908 CM.getWideningDecision(IG->getInsertPos(), VF) == 8909 LoopVectorizationCostModel::CM_Interleave); 8910 // For scalable vectors, the only interleave factor currently supported 8911 // is 2 since we require the (de)interleave2 intrinsics instead of 8912 // shufflevectors. 
8913 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8914 "Unsupported interleave factor for scalable vectors");
8915 return Result;
8916 };
8917 if (!getDecisionAndClampRange(applyIG, Range))
8918 continue;
8919 InterleaveGroups.insert(IG);
8920 for (unsigned i = 0; i < IG->getFactor(); i++)
8921 if (Instruction *Member = IG->getMember(i))
8922 RecipeBuilder.recordRecipeOf(Member);
8923 };
8924
8925 // ---------------------------------------------------------------------------
8926 // Build initial VPlan: Scan the body of the loop in a topological order to
8927 // visit each basic block after having visited its predecessor basic blocks.
8928 // ---------------------------------------------------------------------------
8929
8930 // Create initial VPlan skeleton, having a basic block for the pre-header
8931 // which contains SCEV expansions that need to happen before the CFG is
8932 // modified; a basic block for the vector pre-header, followed by a region for
8933 // the vector loop, followed by the middle basic block. The skeleton vector
8934 // loop region contains a header and latch basic blocks.
8935 VPlanPtr Plan = VPlan::createInitialVPlan(
8936 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8937 *PSE.getSE());
8938 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8939 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8940 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8941 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8942 VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry());
8943 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8944 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8945
8946 // Don't use getDecisionAndClampRange here, because we don't know the UF,
8947 // so it is better to be conservative here rather than to split
8948 // it up into different VPlans.
8949 bool IVUpdateMayOverflow = false;
8950 for (ElementCount VF : Range)
8951 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8952
8953 Instruction *DLInst =
8954 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8955 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8956 DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8957 CM.getTailFoldingStyle(IVUpdateMayOverflow));
8958
8959 // Scan the body of the loop in a topological order to visit each basic block
8960 // after having visited its predecessor basic blocks.
8961 LoopBlocksDFS DFS(OrigLoop);
8962 DFS.perform(LI);
8963
8964 VPBasicBlock *VPBB = HeaderVPBB;
8965 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8966 // Relevant instructions from basic block BB will be grouped into VPRecipe
8967 // ingredients and fill a new VPBasicBlock.
8968 if (VPBB != HeaderVPBB)
8969 VPBB->setName(BB->getName());
8970 Builder.setInsertPoint(VPBB);
8971
8972 // Introduce each ingredient into VPlan.
8973 // TODO: Model and preserve debug intrinsics in VPlan.
8974 for (Instruction &I : BB->instructionsWithoutDebug(false)) {
8975 Instruction *Instr = &I;
8976
8977 // First filter out irrelevant instructions, to ensure no recipes are
8978 // built for them.
8979 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8980 continue; 8981 8982 SmallVector<VPValue *, 4> Operands; 8983 auto *Phi = dyn_cast<PHINode>(Instr); 8984 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8985 Operands.push_back(Plan->getVPValueOrAddLiveIn( 8986 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8987 } else { 8988 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8989 Operands = {OpRange.begin(), OpRange.end()}; 8990 } 8991 8992 // Invariant stores inside loop will be deleted and a single store 8993 // with the final reduction value will be added to the exit block 8994 StoreInst *SI; 8995 if ((SI = dyn_cast<StoreInst>(&I)) && 8996 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8997 continue; 8998 8999 auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9000 Instr, Operands, Range, VPBB, Plan); 9001 if (!RecipeOrValue) 9002 RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); 9003 // If Instr can be simplified to an existing VPValue, use it. 9004 if (isa<VPValue *>(RecipeOrValue)) { 9005 auto *VPV = cast<VPValue *>(RecipeOrValue); 9006 Plan->addVPValue(Instr, VPV); 9007 // If the re-used value is a recipe, register the recipe for the 9008 // instruction, in case the recipe for Instr needs to be recorded. 9009 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 9010 RecipeBuilder.setRecipe(Instr, R); 9011 continue; 9012 } 9013 // Otherwise, add the new recipe. 9014 VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); 9015 for (auto *Def : Recipe->definedValues()) { 9016 auto *UV = Def->getUnderlyingValue(); 9017 Plan->addVPValue(UV, Def); 9018 } 9019 9020 RecipeBuilder.setRecipe(Instr, Recipe); 9021 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9022 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9023 // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the 9024 // phi section of HeaderVPBB. 9025 assert(isa<TruncInst>(Instr)); 9026 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9027 } else 9028 VPBB->appendRecipe(Recipe); 9029 } 9030 9031 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9032 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9033 } 9034 9035 // After here, VPBB should not be used. 9036 VPBB = nullptr; 9037 9038 if (CM.requiresScalarEpilogue(Range)) { 9039 // No edge from the middle block to the unique exit block has been inserted 9040 // and there is nothing to fix from vector loop; phis should have incoming 9041 // from scalar loop only. 9042 } else 9043 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 9044 9045 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 9046 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 9047 "entry block must be set to a VPRegionBlock having a non-empty entry " 9048 "VPBasicBlock"); 9049 RecipeBuilder.fixHeaderPhis(); 9050 9051 // --------------------------------------------------------------------------- 9052 // Transform initial VPlan: Apply previously taken decisions, in order, to 9053 // bring the VPlan to its final state. 9054 // --------------------------------------------------------------------------- 9055 9056 // Adjust the recipes for any inloop reductions. 
9057 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 9058 RecipeBuilder, Range.Start); 9059 9060 // Interleave memory: for each Interleave Group we marked earlier as relevant 9061 // for this VPlan, replace the Recipes widening its memory instructions with a 9062 // single VPInterleaveRecipe at its insertion point. 9063 for (const auto *IG : InterleaveGroups) { 9064 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9065 RecipeBuilder.getRecipe(IG->getInsertPos())); 9066 SmallVector<VPValue *, 4> StoredValues; 9067 for (unsigned i = 0; i < IG->getFactor(); ++i) 9068 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9069 auto *StoreR = 9070 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9071 StoredValues.push_back(StoreR->getStoredValue()); 9072 } 9073 9074 bool NeedsMaskForGaps = 9075 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); 9076 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9077 Recipe->getMask(), NeedsMaskForGaps); 9078 VPIG->insertBefore(Recipe); 9079 unsigned J = 0; 9080 for (unsigned i = 0; i < IG->getFactor(); ++i) 9081 if (Instruction *Member = IG->getMember(i)) { 9082 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); 9083 if (!Member->getType()->isVoidTy()) { 9084 VPValue *OriginalV = MemberR->getVPSingleValue(); 9085 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9086 J++; 9087 } 9088 MemberR->eraseFromParent(); 9089 } 9090 } 9091 9092 for (ElementCount VF : Range) 9093 Plan->addVF(VF); 9094 Plan->setName("Initial VPlan"); 9095 9096 // Replace VPValues for known constant strides guaranteed by predicate scalar 9097 // evolution. 9098 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 9099 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 9100 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 9101 // Only handle constant strides for now. 9102 if (!ScevStride) 9103 continue; 9104 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); 9105 9106 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); 9107 // The versioned value may not be used in the loop directly, so just add a 9108 // new live-in in those cases. 9109 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); 9110 } 9111 9112 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9113 // in ways that accessing values using original IR values is incorrect. 9114 Plan->disableValue2VPValue(); 9115 9116 // Sink users of fixed-order recurrence past the recipe defining the previous 9117 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
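  // E.g. (sketch): for a recurrence like 'out[i] = prev + a[i]; prev = a[i];'
  // the user of the recurrence phi is sunk after the recipe producing a[i]
  // and reads a first-order-recurrence-splice of (previous vector value,
  // current vector value), i.e. a shuffle combining the last lane of the
  // previous vector with the leading lanes of the current one.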
9118 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9119 return std::nullopt;
9120
9121 VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9122 VPlanTransforms::removeRedundantInductionCasts(*Plan);
9123
9124 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9125 VPlanTransforms::removeDeadRecipes(*Plan);
9126
9127 VPlanTransforms::createAndOptimizeReplicateRegions(*Plan);
9128
9129 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9130 VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9131
9132 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9133 return std::make_optional(std::move(Plan));
9134 }
9135
9136 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9137 // Outer loop handling: They may require CFG and instruction level
9138 // transformations before even evaluating whether vectorization is profitable.
9139 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9140 // the vectorization pipeline.
9141 assert(!OrigLoop->isInnermost());
9142 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9143
9144 // Create new empty VPlan
9145 auto Plan = VPlan::createInitialVPlan(
9146 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
9147 *PSE.getSE());
9148
9149 // Build hierarchical CFG
9150 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9151 HCFGBuilder.buildHierarchicalCFG();
9152
9153 for (ElementCount VF : Range)
9154 Plan->addVF(VF);
9155
9156 VPlanTransforms::VPInstructionsToVPRecipes(
9157 Plan,
9158 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9159 *PSE.getSE(), *TLI);
9160
9161 // Remove the existing terminator of the exiting block of the top-most region.
9162 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9163 auto *Term =
9164 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9165 Term->eraseFromParent();
9166
9167 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9168 CM.getTailFoldingStyle());
9169 return Plan;
9170 }
9171
9172 // Adjust the recipes for reductions. For in-loop reductions the chain of
9173 // instructions leading from the loop exit instr to the phi needs to be converted
9174 // to reductions, with one operand being vector and the other being the scalar
9175 // reduction chain. For other reductions, a select is introduced between the phi
9176 // and live-out recipes when folding the tail.
9177 void LoopVectorizationPlanner::adjustRecipesForReductions(
9178 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9179 ElementCount MinVF) {
9180 for (const auto &Reduction : CM.getInLoopReductionChains()) {
9181 PHINode *Phi = Reduction.first;
9182 const RecurrenceDescriptor &RdxDesc =
9183 Legal->getReductionVars().find(Phi)->second;
9184 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9185
9186 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9187 continue;
9188
9189 // ReductionOperations are ordered top-down from the phi's use to the
9190 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9191 // which of the two operands will remain scalar and which will be reduced.
9192 // For minmax the chain will be the select instructions.
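    // Illustrative example (names invented): for an in-loop integer add
    // reduction
    //   %phi = phi i32 [ 0, %ph ], [ %add, %latch ]
    //   %add = add i32 %phi, %x
    // ReductionOperations is {%add}; %phi is the initial Chain, and the
    // other operand (%x) becomes the vector operand of the VPReductionRecipe
    // that replaces the widened add below.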
9193 Instruction *Chain = Phi;
9194 for (Instruction *R : ReductionOperations) {
9195 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9196 RecurKind Kind = RdxDesc.getRecurrenceKind();
9197
9198 VPValue *ChainOp = Plan->getVPValue(Chain);
9199 unsigned FirstOpId;
9200 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9201 "Only min/max recurrences allowed for inloop reductions");
9202 // Recognize a call to the llvm.fmuladd intrinsic.
9203 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9204 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9205 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9206 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9207 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9208 "Expected to replace a VPWidenSelectSC");
9209 FirstOpId = 1;
9210 } else {
9211 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9212 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9213 "Expected to replace a VPWidenSC");
9214 FirstOpId = 0;
9215 }
9216 unsigned VecOpId =
9217 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9218 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9219
9220 VPValue *CondOp = nullptr;
9221 if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
9222 VPBuilder::InsertPointGuard Guard(Builder);
9223 Builder.setInsertPoint(WidenRecipe->getParent(),
9224 WidenRecipe->getIterator());
9225 CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan);
9226 }
9227
9228 if (IsFMulAdd) {
9229 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9230 // need to create an fmul recipe to use as the vector operand for the
9231 // fadd reduction.
9232 VPInstruction *FMulRecipe = new VPInstruction(
9233 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9234 FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9235 WidenRecipe->getParent()->insert(FMulRecipe,
9236 WidenRecipe->getIterator());
9237 VecOp = FMulRecipe;
9238 }
9239 VPReductionRecipe *RedRecipe =
9240 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI);
9241 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9242 Plan->removeVPValueFor(R);
9243 Plan->addVPValue(R, RedRecipe);
9244 // Append the recipe to the end of the VPBasicBlock because we need to
9245 // ensure that it comes after all of its inputs, including CondOp.
9246 WidenRecipe->getParent()->appendRecipe(RedRecipe);
9247 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9248 WidenRecipe->eraseFromParent();
9249
9250 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9251 VPRecipeBase *CompareRecipe =
9252 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9253 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9254 "Expected to replace a VPWidenSC");
9255 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9256 "Expected no remaining users");
9257 CompareRecipe->eraseFromParent();
9258 }
9259 Chain = R;
9260 }
9261 }
9262
9263 // If tail is folded by masking, introduce selects between the phi
9264 // and the live-out instruction of each reduction, at the beginning of the
9265 // dedicated latch block.
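  // E.g. (sketch): for an out-of-loop reduction phi %rdx with backedge value
  // %rdx.next, the latch gains
  //   %sel = select <header block mask>, %rdx.next, %rdx
  // so lanes disabled by tail folding carry the previous value forward
  // instead of a value computed for a non-existent iteration.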
9266 if (CM.foldTailByMasking()) { 9267 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9268 for (VPRecipeBase &R : 9269 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9270 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9271 if (!PhiR || PhiR->isInLoop()) 9272 continue; 9273 VPValue *Cond = 9274 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); 9275 VPValue *Red = PhiR->getBackedgeValue(); 9276 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && 9277 "reduction recipe must be defined before latch"); 9278 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9279 } 9280 } 9281 9282 VPlanTransforms::clearReductionWrapFlags(*Plan); 9283 } 9284 9285 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9286 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9287 VPSlotTracker &SlotTracker) const { 9288 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9289 IG->getInsertPos()->printAsOperand(O, false); 9290 O << ", "; 9291 getAddr()->printAsOperand(O, SlotTracker); 9292 VPValue *Mask = getMask(); 9293 if (Mask) { 9294 O << ", "; 9295 Mask->printAsOperand(O, SlotTracker); 9296 } 9297 9298 unsigned OpIdx = 0; 9299 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9300 if (!IG->getMember(i)) 9301 continue; 9302 if (getNumStoreOperands() > 0) { 9303 O << "\n" << Indent << " store "; 9304 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9305 O << " to index " << i; 9306 } else { 9307 O << "\n" << Indent << " "; 9308 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9309 O << " = load from index " << i; 9310 } 9311 ++OpIdx; 9312 } 9313 } 9314 #endif 9315 9316 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9317 assert(!State.Instance && "Int or FP induction being replicated."); 9318 9319 Value *Start = getStartValue()->getLiveInIRValue(); 9320 const InductionDescriptor &ID = getInductionDescriptor(); 9321 TruncInst *Trunc = getTruncInst(); 9322 IRBuilderBase &Builder = State.Builder; 9323 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9324 assert(State.VF.isVector() && "must have vector VF"); 9325 9326 // The value from the original loop to which we are mapping the new induction 9327 // variable. 9328 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9329 9330 // Fast-math-flags propagate from the original induction instruction. 9331 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9332 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9333 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9334 9335 // Now do the actual transformations, and start with fetching the step value. 
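  // Rough sketch of the IR this recipe emits for an i32 induction with step 1
  // at VF=4, UF=1 (illustrative only; actual constants depend on VF/UF/step):
  //   vector.ph:   %stepped.start = <start+0, start+1, start+2, start+3>
  //   vector.body: %vec.ind = phi [ %stepped.start, %vector.ph ],
  //                               [ %vec.ind.next, %vector.body ]
  //                %vec.ind.next = add %vec.ind, <4, 4, 4, 4>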
9336 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9337 9338 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9339 "Expected either an induction phi-node or a truncate of it!"); 9340 9341 // Construct the initial value of the vector IV in the vector loop preheader 9342 auto CurrIP = Builder.saveIP(); 9343 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9344 Builder.SetInsertPoint(VectorPH->getTerminator()); 9345 if (isa<TruncInst>(EntryVal)) { 9346 assert(Start->getType()->isIntegerTy() && 9347 "Truncation requires an integer type"); 9348 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9349 Step = Builder.CreateTrunc(Step, TruncType); 9350 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9351 } 9352 9353 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9354 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9355 Value *SteppedStart = getStepVector( 9356 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9357 9358 // We create vector phi nodes for both integer and floating-point induction 9359 // variables. Here, we determine the kind of arithmetic we will perform. 9360 Instruction::BinaryOps AddOp; 9361 Instruction::BinaryOps MulOp; 9362 if (Step->getType()->isIntegerTy()) { 9363 AddOp = Instruction::Add; 9364 MulOp = Instruction::Mul; 9365 } else { 9366 AddOp = ID.getInductionOpcode(); 9367 MulOp = Instruction::FMul; 9368 } 9369 9370 // Multiply the vectorization factor by the step using integer or 9371 // floating-point arithmetic as appropriate. 9372 Type *StepType = Step->getType(); 9373 Value *RuntimeVF; 9374 if (Step->getType()->isFloatingPointTy()) 9375 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9376 else 9377 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9378 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9379 9380 // Create a vector splat to use in the induction update. 9381 // 9382 // FIXME: If the step is non-constant, we create the vector splat with 9383 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9384 // handle a constant vector splat. 9385 Value *SplatVF = isa<Constant>(Mul) 9386 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9387 : Builder.CreateVectorSplat(State.VF, Mul); 9388 Builder.restoreIP(CurrIP); 9389 9390 // We may need to add the step a number of times, depending on the unroll 9391 // factor. The last of those goes into the PHI. 9392 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9393 &*State.CFG.PrevBB->getFirstInsertionPt()); 9394 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9395 Instruction *LastInduction = VecInd; 9396 for (unsigned Part = 0; Part < State.UF; ++Part) { 9397 State.set(this, LastInduction, Part); 9398 9399 if (isa<TruncInst>(EntryVal)) 9400 State.addMetadata(LastInduction, EntryVal); 9401 9402 LastInduction = cast<Instruction>( 9403 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9404 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9405 } 9406 9407 LastInduction->setName("vec.ind.next"); 9408 VecInd->addIncoming(SteppedStart, VectorPH); 9409 // Add induction update using an incorrect block temporarily. The phi node 9410 // will be fixed after VPlan execution. Note that at this point the latch 9411 // block cannot be used, as it does not exist yet. 9412 // TODO: Model increment value in VPlan, by turning the recipe into a 9413 // multi-def and a subclass of VPHeaderPHIRecipe. 
9414 VecInd->addIncoming(LastInduction, VectorPH); 9415 } 9416 9417 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9418 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9419 "Not a pointer induction according to InductionDescriptor!"); 9420 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9421 "Unexpected type."); 9422 9423 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9424 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9425 9426 if (onlyScalarsGenerated(State.VF)) { 9427 // This is the normalized GEP that starts counting at zero. 9428 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9429 CanonicalIV, IndDesc.getStep()->getType()); 9430 // Determine the number of scalars we need to generate for each unroll 9431 // iteration. If the instruction is uniform, we only need to generate the 9432 // first lane. Otherwise, we generate all VF values. 9433 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9434 assert((IsUniform || !State.VF.isScalable()) && 9435 "Cannot scalarize a scalable VF"); 9436 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9437 9438 for (unsigned Part = 0; Part < State.UF; ++Part) { 9439 Value *PartStart = 9440 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9441 9442 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9443 Value *Idx = State.Builder.CreateAdd( 9444 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9445 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9446 9447 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); 9448 Value *SclrGep = emitTransformedIndex( 9449 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9450 SclrGep->setName("next.gep"); 9451 State.set(this, SclrGep, VPIteration(Part, Lane)); 9452 } 9453 } 9454 return; 9455 } 9456 9457 Type *PhiType = IndDesc.getStep()->getType(); 9458 9459 // Build a pointer phi 9460 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9461 Type *ScStValueType = ScalarStartValue->getType(); 9462 PHINode *NewPointerPhi = 9463 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9464 9465 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9466 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9467 9468 // A pointer induction, performed by using a gep 9469 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9470 9471 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9472 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9473 Value *NumUnrolledElems = 9474 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9475 Value *InductionGEP = GetElementPtrInst::Create( 9476 State.Builder.getInt8Ty(), NewPointerPhi, 9477 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9478 InductionLoc); 9479 // Add induction update using an incorrect block temporarily. The phi node 9480 // will be fixed after VPlan execution. Note that at this point the latch 9481 // block cannot be used, as it does not exist yet. 9482 // TODO: Model increment value in VPlan, by turning the recipe into a 9483 // multi-def and a subclass of VPHeaderPHIRecipe. 9484 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9485 9486 // Create UF many actual address geps that use the pointer 9487 // phi as base and a vectorized version of the step value 9488 // (<step*0, ..., step*N>) as offset. 
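  // E.g. (sketch, VF=4, UF=1, scalar step %s): each part computes
  //   %off = <0, 1, 2, 3> + splat(RuntimeVF * Part)
  //   %gep = getelementptr i8, ptr %pointer.phi, (%off * splat(%s))
  // while %ptr.ind above advances %pointer.phi by RuntimeVF * UF * %s per
  // vector iteration.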
9489 for (unsigned Part = 0; Part < State.UF; ++Part) { 9490 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9491 Value *StartOffsetScalar = 9492 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9493 Value *StartOffset = 9494 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9495 // Create a vector of consecutive numbers from zero to VF. 9496 StartOffset = State.Builder.CreateAdd( 9497 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9498 9499 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 9500 "scalar step must be the same across all parts"); 9501 Value *GEP = State.Builder.CreateGEP( 9502 State.Builder.getInt8Ty(), NewPointerPhi, 9503 State.Builder.CreateMul( 9504 StartOffset, 9505 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9506 "vector.gep")); 9507 State.set(this, GEP, Part); 9508 } 9509 } 9510 9511 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9512 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9513 9514 // Fast-math-flags propagate from the original induction instruction. 9515 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9516 if (IndDesc.getInductionBinOp() && 9517 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9518 State.Builder.setFastMathFlags( 9519 IndDesc.getInductionBinOp()->getFastMathFlags()); 9520 9521 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9522 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9523 Value *DerivedIV = 9524 emitTransformedIndex(State.Builder, CanonicalIV, 9525 getStartValue()->getLiveInIRValue(), Step, IndDesc); 9526 DerivedIV->setName("offset.idx"); 9527 if (ResultTy != DerivedIV->getType()) { 9528 assert(Step->getType()->isIntegerTy() && 9529 "Truncation requires an integer step"); 9530 DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); 9531 } 9532 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9533 9534 State.set(this, DerivedIV, VPIteration(0, 0)); 9535 } 9536 9537 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9538 // Fast-math-flags propagate from the original induction instruction. 9539 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9540 if (IndDesc.getInductionBinOp() && 9541 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9542 State.Builder.setFastMathFlags( 9543 IndDesc.getInductionBinOp()->getFastMathFlags()); 9544 9545 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 9546 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9547 9548 buildScalarSteps(BaseIV, Step, IndDesc, this, State); 9549 } 9550 9551 void VPInterleaveRecipe::execute(VPTransformState &State) { 9552 assert(!State.Instance && "Interleave group being replicated."); 9553 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9554 getStoredValues(), getMask(), 9555 NeedsMaskForGaps); 9556 } 9557 9558 void VPReductionRecipe::execute(VPTransformState &State) { 9559 assert(!State.Instance && "Reduction being replicated."); 9560 Value *PrevInChain = State.get(getChainOp(), 0); 9561 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9562 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9563 // Propagate the fast-math flags carried by the underlying instruction. 
9564 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9565 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9566 for (unsigned Part = 0; Part < State.UF; ++Part) { 9567 Value *NewVecOp = State.get(getVecOp(), Part); 9568 if (VPValue *Cond = getCondOp()) { 9569 Value *NewCond = State.get(Cond, Part); 9570 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9571 Value *Iden = RdxDesc->getRecurrenceIdentity( 9572 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9573 Value *IdenVec = 9574 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9575 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9576 NewVecOp = Select; 9577 } 9578 Value *NewRed; 9579 Value *NextInChain; 9580 if (IsOrdered) { 9581 if (State.VF.isVector()) 9582 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9583 PrevInChain); 9584 else 9585 NewRed = State.Builder.CreateBinOp( 9586 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9587 NewVecOp); 9588 PrevInChain = NewRed; 9589 } else { 9590 PrevInChain = State.get(getChainOp(), Part); 9591 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9592 } 9593 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9594 NextInChain = 9595 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9596 NewRed, PrevInChain); 9597 } else if (IsOrdered) 9598 NextInChain = NewRed; 9599 else 9600 NextInChain = State.Builder.CreateBinOp( 9601 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9602 PrevInChain); 9603 State.set(this, NextInChain, Part); 9604 } 9605 } 9606 9607 void VPReplicateRecipe::execute(VPTransformState &State) { 9608 Instruction *UI = getUnderlyingInstr(); 9609 if (State.Instance) { // Generate a single instance. 9610 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9611 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); 9612 // Insert scalar instance packing it into a vector. 9613 if (State.VF.isVector() && shouldPack()) { 9614 // If we're constructing lane 0, initialize to start from poison. 9615 if (State.Instance->Lane.isFirstLane()) { 9616 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9617 Value *Poison = PoisonValue::get( 9618 VectorType::get(UI->getType(), State.VF)); 9619 State.set(this, Poison, State.Instance->Part); 9620 } 9621 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9622 } 9623 return; 9624 } 9625 9626 if (IsUniform) { 9627 // If the recipe is uniform across all parts (instead of just per VF), only 9628 // generate a single instance. 9629 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9630 all_of(operands(), [](VPValue *Op) { 9631 return Op->isDefinedOutsideVectorRegions(); 9632 })) { 9633 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); 9634 if (user_begin() != user_end()) { 9635 for (unsigned Part = 1; Part < State.UF; ++Part) 9636 State.set(this, State.get(this, VPIteration(0, 0)), 9637 VPIteration(Part, 0)); 9638 } 9639 return; 9640 } 9641 9642 // Uniform within VL means we need to generate lane 0 only for each 9643 // unrolled copy. 9644 for (unsigned Part = 0; Part < State.UF; ++Part) 9645 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); 9646 return; 9647 } 9648 9649 // A store of a loop varying value to a uniform address only needs the last 9650 // copy of the store. 
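  // For example (illustrative): in a loop like
  //   for (i = 0; i < n; ++i) *p = a[i];
  // only the value written by the final lane of the final unrolled part is
  // observable after the loop, so a single scalar store suffices, as done
  // below.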
9651 if (isa<StoreInst>(UI) && 9652 vputils::isUniformAfterVectorization(getOperand(1))) { 9653 auto Lane = VPLane::getLastLaneForVF(State.VF); 9654 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), 9655 State); 9656 return; 9657 } 9658 9659 // Generate scalar instances for all VF lanes of all UF parts. 9660 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9661 const unsigned EndLane = State.VF.getKnownMinValue(); 9662 for (unsigned Part = 0; Part < State.UF; ++Part) 9663 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9664 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); 9665 } 9666 9667 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9668 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9669 9670 // Attempt to issue a wide load. 9671 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9672 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9673 9674 assert((LI || SI) && "Invalid Load/Store instruction"); 9675 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9676 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9677 9678 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9679 9680 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9681 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9682 bool CreateGatherScatter = !isConsecutive(); 9683 9684 auto &Builder = State.Builder; 9685 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9686 bool isMaskRequired = getMask(); 9687 if (isMaskRequired) 9688 for (unsigned Part = 0; Part < State.UF; ++Part) 9689 BlockInMaskParts[Part] = State.get(getMask(), Part); 9690 9691 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9692 // Calculate the pointer for the specific unroll-part. 9693 Value *PartPtr = nullptr; 9694 9695 // Use i32 for the gep index type when the value is constant, 9696 // or query DataLayout for a more suitable index type otherwise. 9697 const DataLayout &DL = 9698 Builder.GetInsertBlock()->getModule()->getDataLayout(); 9699 Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) 9700 ? DL.getIndexType(ScalarDataTy->getPointerTo()) 9701 : Builder.getInt32Ty(); 9702 bool InBounds = false; 9703 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9704 InBounds = gep->isInBounds(); 9705 if (isReverse()) { 9706 // If the address is consecutive but reversed, then the 9707 // wide store needs to start at the last vector element. 9708 // RunTimeVF = VScale * VF.getKnownMinValue() 9709 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9710 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 9711 // NumElt = -Part * RunTimeVF 9712 Value *NumElt = 9713 Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 9714 // LastLane = 1 - RunTimeVF 9715 Value *LastLane = 9716 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 9717 PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); 9718 PartPtr = 9719 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); 9720 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9721 BlockInMaskParts[Part] = 9722 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9723 } else { 9724 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 9725 PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); 9726 } 9727 9728 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9729 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9730 }; 9731 9732 // Handle Stores: 9733 if (SI) { 9734 State.setDebugLocFromInst(SI); 9735 9736 for (unsigned Part = 0; Part < State.UF; ++Part) { 9737 Instruction *NewSI = nullptr; 9738 Value *StoredVal = State.get(StoredValue, Part); 9739 if (CreateGatherScatter) { 9740 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9741 Value *VectorGep = State.get(getAddr(), Part); 9742 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9743 MaskPart); 9744 } else { 9745 if (isReverse()) { 9746 // If we store to reverse consecutive memory locations, then we need 9747 // to reverse the order of elements in the stored value. 9748 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9749 // We don't want to update the value in the map as it might be used in 9750 // another expression. So don't call resetVectorValue(StoredVal). 9751 } 9752 auto *VecPtr = 9753 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9754 if (isMaskRequired) 9755 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9756 BlockInMaskParts[Part]); 9757 else 9758 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9759 } 9760 State.addMetadata(NewSI, SI); 9761 } 9762 return; 9763 } 9764 9765 // Handle loads. 9766 assert(LI && "Must have a load instruction"); 9767 State.setDebugLocFromInst(LI); 9768 for (unsigned Part = 0; Part < State.UF; ++Part) { 9769 Value *NewLI; 9770 if (CreateGatherScatter) { 9771 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9772 Value *VectorGep = State.get(getAddr(), Part); 9773 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9774 nullptr, "wide.masked.gather"); 9775 State.addMetadata(NewLI, LI); 9776 } else { 9777 auto *VecPtr = 9778 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9779 if (isMaskRequired) 9780 NewLI = Builder.CreateMaskedLoad( 9781 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9782 PoisonValue::get(DataTy), "wide.masked.load"); 9783 else 9784 NewLI = 9785 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9786 9787 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9788 State.addMetadata(NewLI, LI); 9789 if (Reverse) 9790 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9791 } 9792 9793 State.set(getVPSingleValue(), NewLI, Part); 9794 } 9795 } 9796 9797 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9798 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9799 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9800 // for predication. 9801 static ScalarEpilogueLowering getScalarEpilogueLowering( 9802 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9803 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9804 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9805 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9806 // don't look at hints or options, and don't request a scalar epilogue. 
9807 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9808 // LoopAccessInfo (due to code dependency and not being able to reliably get 9809 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9810 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9811 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9812 // back to the old way and vectorize with versioning when forced. See D81345.) 9813 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9814 PGSOQueryType::IRPass) && 9815 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9816 return CM_ScalarEpilogueNotAllowedOptSize; 9817 9818 // 2) If set, obey the directives 9819 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9820 switch (PreferPredicateOverEpilogue) { 9821 case PreferPredicateTy::ScalarEpilogue: 9822 return CM_ScalarEpilogueAllowed; 9823 case PreferPredicateTy::PredicateElseScalarEpilogue: 9824 return CM_ScalarEpilogueNotNeededUsePredicate; 9825 case PreferPredicateTy::PredicateOrDontVectorize: 9826 return CM_ScalarEpilogueNotAllowedUsePredicate; 9827 }; 9828 } 9829 9830 // 3) If set, obey the hints 9831 switch (Hints.getPredicate()) { 9832 case LoopVectorizeHints::FK_Enabled: 9833 return CM_ScalarEpilogueNotNeededUsePredicate; 9834 case LoopVectorizeHints::FK_Disabled: 9835 return CM_ScalarEpilogueAllowed; 9836 }; 9837 9838 // 4) if the TTI hook indicates this is profitable, request predication. 9839 TailFoldingInfo TFI(TLI, &LVL, IAI); 9840 if (TTI->preferPredicateOverEpilogue(&TFI)) 9841 return CM_ScalarEpilogueNotNeededUsePredicate; 9842 9843 return CM_ScalarEpilogueAllowed; 9844 } 9845 9846 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9847 // If Values have been set for this Def return the one relevant for \p Part. 9848 if (hasVectorValue(Def, Part)) 9849 return Data.PerPartOutput[Def][Part]; 9850 9851 auto GetBroadcastInstrs = [this, Def](Value *V) { 9852 bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); 9853 if (VF.isScalar()) 9854 return V; 9855 // Place the code for broadcasting invariant variables in the new preheader. 9856 IRBuilder<>::InsertPointGuard Guard(Builder); 9857 if (SafeToHoist) { 9858 BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( 9859 Plan->getVectorLoopRegion()->getSinglePredecessor())]; 9860 if (LoopVectorPreHeader) 9861 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 9862 } 9863 9864 // Place the code for broadcasting invariant variables in the new preheader. 9865 // Broadcast the scalar into all locations in the vector. 9866 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 9867 9868 return Shuf; 9869 }; 9870 9871 if (!hasScalarValue(Def, {Part, 0})) { 9872 Value *IRV = Def->getLiveInIRValue(); 9873 Value *B = GetBroadcastInstrs(IRV); 9874 set(Def, B, Part); 9875 return B; 9876 } 9877 9878 Value *ScalarValue = get(Def, {Part, 0}); 9879 // If we aren't vectorizing, we can just copy the scalar map values over 9880 // to the vector map. 9881 if (VF.isScalar()) { 9882 set(Def, ScalarValue, Part); 9883 return ScalarValue; 9884 } 9885 9886 bool IsUniform = vputils::isUniformAfterVectorization(Def); 9887 9888 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9889 // Check if there is a scalar value for the selected lane. 9890 if (!hasScalarValue(Def, {Part, LastLane})) { 9891 // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and 9892 // VPExpandSCEVRecipes can also be uniform. 
9893 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || 9894 isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || 9895 isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && 9896 "unexpected recipe found to be invariant"); 9897 IsUniform = true; 9898 LastLane = 0; 9899 } 9900 9901 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9902 // Set the insert point after the last scalarized instruction or after the 9903 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9904 // will directly follow the scalar definitions. 9905 auto OldIP = Builder.saveIP(); 9906 auto NewIP = 9907 isa<PHINode>(LastInst) 9908 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9909 : std::next(BasicBlock::iterator(LastInst)); 9910 Builder.SetInsertPoint(&*NewIP); 9911 9912 // However, if we are vectorizing, we need to construct the vector values. 9913 // If the value is known to be uniform after vectorization, we can just 9914 // broadcast the scalar value corresponding to lane zero for each unroll 9915 // iteration. Otherwise, we construct the vector values using 9916 // insertelement instructions. Since the resulting vectors are stored in 9917 // State, we will only generate the insertelements once. 9918 Value *VectorValue = nullptr; 9919 if (IsUniform) { 9920 VectorValue = GetBroadcastInstrs(ScalarValue); 9921 set(Def, VectorValue, Part); 9922 } else { 9923 // Initialize packing with insertelements to start from undef. 9924 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9925 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9926 set(Def, Undef, Part); 9927 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9928 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9929 VectorValue = get(Def, Part); 9930 } 9931 Builder.restoreIP(OldIP); 9932 return VectorValue; 9933 } 9934 9935 // Process the loop in the VPlan-native vectorization path. This path builds 9936 // VPlan upfront in the vectorization pipeline, which allows to apply 9937 // VPlan-to-VPlan transformations from the very beginning without modifying the 9938 // input LLVM IR. 9939 static bool processLoopInVPlanNativePath( 9940 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9941 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9942 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9943 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9944 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9945 LoopVectorizationRequirements &Requirements) { 9946 9947 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9948 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9949 return false; 9950 } 9951 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9952 Function *F = L->getHeader()->getParent(); 9953 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9954 9955 ScalarEpilogueLowering SEL = 9956 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9957 9958 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9959 &Hints, IAI); 9960 // Use the planner for outer loop vectorization. 9961 // TODO: CM is not used at this point inside the planner. Turn CM into an 9962 // optional argument if we don't need it in the future. 9963 LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE); 9964 9965 // Get user vectorization factor. 
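  // (Typically provided via loop metadata, e.g. set by
  //  '#pragma clang loop vectorize_width(4)' on the source loop.)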
9966 ElementCount UserVF = Hints.getWidth(); 9967 9968 CM.collectElementTypesForWidening(); 9969 9970 // Plan how to best vectorize, return the best VF and its cost. 9971 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9972 9973 // If we are stress testing VPlan builds, do not attempt to generate vector 9974 // code. Masked vector code generation support will follow soon. 9975 // Also, do not attempt to vectorize if no vector code will be produced. 9976 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9977 return false; 9978 9979 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 9980 9981 { 9982 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9983 F->getParent()->getDataLayout()); 9984 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 9985 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 9986 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9987 << L->getHeader()->getParent()->getName() << "\"\n"); 9988 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 9989 } 9990 9991 // Mark the loop as already vectorized to avoid vectorizing again. 9992 Hints.setAlreadyVectorized(); 9993 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9994 return true; 9995 } 9996 9997 // Emit a remark if there are stores to floats that required a floating point 9998 // extension. If the vectorized loop was generated with floating point there 9999 // will be a performance penalty from the conversion overhead and the change in 10000 // the vector width. 10001 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10002 SmallVector<Instruction *, 4> Worklist; 10003 for (BasicBlock *BB : L->getBlocks()) { 10004 for (Instruction &Inst : *BB) { 10005 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10006 if (S->getValueOperand()->getType()->isFloatTy()) 10007 Worklist.push_back(S); 10008 } 10009 } 10010 } 10011 10012 // Traverse the floating point stores upwards searching, for floating point 10013 // conversions. 10014 SmallPtrSet<const Instruction *, 4> Visited; 10015 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10016 while (!Worklist.empty()) { 10017 auto *I = Worklist.pop_back_val(); 10018 if (!L->contains(I)) 10019 continue; 10020 if (!Visited.insert(I).second) 10021 continue; 10022 10023 // Emit a remark if the floating point store required a floating 10024 // point conversion. 10025 // TODO: More work could be done to identify the root cause such as a 10026 // constant or a function return type and point the user to it. 10027 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10028 ORE->emit([&]() { 10029 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10030 I->getDebugLoc(), L->getHeader()) 10031 << "floating point conversion changes vector width. " 10032 << "Mixed floating point precision requires an up/down " 10033 << "cast that will negatively impact performance."; 10034 }); 10035 10036 for (Use &Op : I->operands()) 10037 if (auto *OpI = dyn_cast<Instruction>(Op)) 10038 Worklist.push_back(OpI); 10039 } 10040 } 10041 10042 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10043 VectorizationFactor &VF, 10044 std::optional<unsigned> VScale, Loop *L, 10045 ScalarEvolution &SE) { 10046 InstructionCost CheckCost = Checks.getCost(); 10047 if (!CheckCost.isValid()) 10048 return false; 10049 10050 // When interleaving only scalar and vector cost will be equal, which in turn 10051 // would lead to a divide by 0. Fall back to hard threshold. 
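  // (The hard threshold referred to here is the VectorizeMemoryCheckThreshold
  //  option used in the check below.)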
10052 if (VF.Width.isScalar()) { 10053 if (CheckCost > VectorizeMemoryCheckThreshold) { 10054 LLVM_DEBUG( 10055 dbgs() 10056 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10057 return false; 10058 } 10059 return true; 10060 } 10061 10062 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 10063 double ScalarC = *VF.ScalarCost.getValue(); 10064 if (ScalarC == 0) 10065 return true; 10066 10067 // First, compute the minimum iteration count required so that the vector 10068 // loop outperforms the scalar loop. 10069 // The total cost of the scalar loop is 10070 // ScalarC * TC 10071 // where 10072 // * TC is the actual trip count of the loop. 10073 // * ScalarC is the cost of a single scalar iteration. 10074 // 10075 // The total cost of the vector loop is 10076 // RtC + VecC * (TC / VF) + EpiC 10077 // where 10078 // * RtC is the cost of the generated runtime checks 10079 // * VecC is the cost of a single vector iteration. 10080 // * TC is the actual trip count of the loop 10081 // * VF is the vectorization factor 10082 // * EpiCost is the cost of the generated epilogue, including the cost 10083 // of the remaining scalar operations. 10084 // 10085 // Vectorization is profitable once the total vector cost is less than the 10086 // total scalar cost: 10087 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10088 // 10089 // Now we can compute the minimum required trip count TC as 10090 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 10091 // 10092 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10093 // the computations are performed on doubles, not integers and the result 10094 // is rounded up, hence we get an upper estimate of the TC. 10095 unsigned IntVF = VF.Width.getKnownMinValue(); 10096 if (VF.Width.isScalable()) { 10097 unsigned AssumedMinimumVscale = 1; 10098 if (VScale) 10099 AssumedMinimumVscale = *VScale; 10100 IntVF *= AssumedMinimumVscale; 10101 } 10102 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 10103 double RtC = *CheckCost.getValue(); 10104 double MinTC1 = RtC / (ScalarC - VecCOverVF); 10105 10106 // Second, compute a minimum iteration count so that the cost of the 10107 // runtime checks is only a fraction of the total scalar loop cost. This 10108 // adds a loop-dependent bound on the overhead incurred if the runtime 10109 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10110 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10111 // cost, compute 10112 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10113 double MinTC2 = RtC * 10 / ScalarC; 10114 10115 // Now pick the larger minimum. If it is not a multiple of VF, choose the 10116 // next closest multiple of VF. This should partly compensate for ignoring 10117 // the epilogue cost. 10118 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 10119 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); 10120 10121 LLVM_DEBUG( 10122 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10123 << VF.MinProfitableTripCount << "\n"); 10124 10125 // Skip vectorization if the expected trip count is less than the minimum 10126 // required trip count. 
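  // Illustrative numbers only: with ScalarC = 4, VecC = 12, VF = 4 and
  // RtC = 30, MinTC1 = 30 / (4 - 12/4) = 30 and MinTC2 = 30 * 10 / 4 = 75, so
  // MinProfitableTripCount becomes 76 (75 rounded up to a multiple of VF), and
  // loops expected to run fewer iterations than that are skipped below.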
10127 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 10128 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10129 VF.MinProfitableTripCount)) { 10130 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10131 "trip count < minimum profitable VF (" 10132 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10133 << ")\n"); 10134 10135 return false; 10136 } 10137 } 10138 return true; 10139 } 10140 10141 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10142 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10143 !EnableLoopInterleaving), 10144 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10145 !EnableLoopVectorization) {} 10146 10147 bool LoopVectorizePass::processLoop(Loop *L) { 10148 assert((EnableVPlanNativePath || L->isInnermost()) && 10149 "VPlan-native path is not enabled. Only process inner loops."); 10150 10151 #ifndef NDEBUG 10152 const std::string DebugLocStr = getDebugLocString(L); 10153 #endif /* NDEBUG */ 10154 10155 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10156 << L->getHeader()->getParent()->getName() << "' from " 10157 << DebugLocStr << "\n"); 10158 10159 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10160 10161 LLVM_DEBUG( 10162 dbgs() << "LV: Loop hints:" 10163 << " force=" 10164 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10165 ? "disabled" 10166 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10167 ? "enabled" 10168 : "?")) 10169 << " width=" << Hints.getWidth() 10170 << " interleave=" << Hints.getInterleave() << "\n"); 10171 10172 // Function containing loop 10173 Function *F = L->getHeader()->getParent(); 10174 10175 // Looking at the diagnostic output is the only way to determine if a loop 10176 // was vectorized (other than looking at the IR or machine code), so it 10177 // is important to generate an optimization remark for each loop. Most of 10178 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10179 // generated as OptimizationRemark and OptimizationRemarkMissed are 10180 // less verbose reporting vectorized loops and unvectorized loops that may 10181 // benefit from vectorization, respectively. 10182 10183 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10184 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10185 return false; 10186 } 10187 10188 PredicatedScalarEvolution PSE(*SE, *L); 10189 10190 // Check if it is legal to vectorize the loop. 10191 LoopVectorizationRequirements Requirements; 10192 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10193 &Requirements, &Hints, DB, AC, BFI, PSI); 10194 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10195 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10196 Hints.emitRemarkWithHints(); 10197 return false; 10198 } 10199 10200 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10201 // here. They may require CFG and instruction level transformations before 10202 // even evaluating whether vectorization is profitable. Since we cannot modify 10203 // the incoming IR, we need to build VPlan upfront in the vectorization 10204 // pipeline. 
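  // (The VPlan-native path is off by default and has to be explicitly enabled
  //  via the EnableVPlanNativePath option.)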
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
        LLVM_DEBUG(dbgs() << "\n");
        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
      } else {
        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                             "small to consider vectorizing.\n");
        reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
            "loop trip count is too low, avoiding vectorization",
            "LowTripCount", ORE, L);
        Hints.emitRemarkWithHints();
        return false;
      }
    }
  }

  // Check the function attributes to see if implicit floats or vectors are
  // allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
10280 if (ForceOrderedReductions.getNumOccurrences() > 0) 10281 AllowOrderedReductions = ForceOrderedReductions; 10282 else 10283 AllowOrderedReductions = TTI->enableOrderedReductions(); 10284 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10285 ORE->emit([&]() { 10286 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10287 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10288 ExactFPMathInst->getDebugLoc(), 10289 ExactFPMathInst->getParent()) 10290 << "loop not vectorized: cannot prove it is safe to reorder " 10291 "floating-point operations"; 10292 }); 10293 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10294 "reorder floating-point operations\n"); 10295 Hints.emitRemarkWithHints(); 10296 return false; 10297 } 10298 10299 // Use the cost model. 10300 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10301 F, &Hints, IAI); 10302 // Use the planner for vectorization. 10303 LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 10304 ORE); 10305 10306 // Get user vectorization factor and interleave count. 10307 ElementCount UserVF = Hints.getWidth(); 10308 unsigned UserIC = Hints.getInterleave(); 10309 10310 // Plan how to best vectorize, return the best VF and its cost. 10311 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10312 10313 VectorizationFactor VF = VectorizationFactor::Disabled(); 10314 unsigned IC = 1; 10315 10316 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10317 F->getParent()->getDataLayout()); 10318 if (MaybeVF) { 10319 VF = *MaybeVF; 10320 // Select the interleave count. 10321 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10322 10323 unsigned SelectedIC = std::max(IC, UserIC); 10324 // Optimistically generate runtime checks if they are needed. Drop them if 10325 // they turn out to not be profitable. 10326 if (VF.Width.isVector() || SelectedIC > 1) 10327 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10328 10329 // Check if it is profitable to vectorize with runtime checks. 10330 bool ForceVectorization = 10331 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10332 if (!ForceVectorization && 10333 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, 10334 *PSE.getSE())) { 10335 ORE->emit([&]() { 10336 return OptimizationRemarkAnalysisAliasing( 10337 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10338 L->getHeader()) 10339 << "loop not vectorized: cannot prove it is safe to reorder " 10340 "memory operations"; 10341 }); 10342 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10343 Hints.emitRemarkWithHints(); 10344 return false; 10345 } 10346 } 10347 10348 // Identify the diagnostic messages that should be produced. 10349 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10350 bool VectorizeLoop = true, InterleaveLoop = true; 10351 if (VF.Width.isScalar()) { 10352 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10353 VecDiagMsg = std::make_pair( 10354 "VectorizationNotBeneficial", 10355 "the cost-model indicates that vectorization is not beneficial"); 10356 VectorizeLoop = false; 10357 } 10358 10359 if (!MaybeVF && UserIC > 1) { 10360 // Tell the user interleaving was avoided up-front, despite being explicitly 10361 // requested. 
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
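      // (Interleaving-only still goes through VPlan execution below, just with
      //  a scalar VF and the selected interleave count as the unroll factor.)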
10438 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10439 &CM, BFI, PSI, Checks); 10440 10441 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10442 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10443 10444 ORE->emit([&]() { 10445 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10446 L->getHeader()) 10447 << "interleaved loop (interleaved count: " 10448 << NV("InterleaveCount", IC) << ")"; 10449 }); 10450 } else { 10451 // If we decided that it is *legal* to vectorize the loop, then do it. 10452 10453 // Consider vectorizing the epilogue too if it's profitable. 10454 VectorizationFactor EpilogueVF = 10455 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 10456 if (EpilogueVF.Width.isVector()) { 10457 10458 // The first pass vectorizes the main loop and creates a scalar epilogue 10459 // to be vectorized by executing the plan (potentially with a different 10460 // factor) again shortly afterwards. 10461 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10462 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10463 EPI, &LVL, &CM, BFI, PSI, Checks); 10464 10465 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10466 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, 10467 BestMainPlan, MainILV, DT, true); 10468 ++LoopsVectorized; 10469 10470 // Second pass vectorizes the epilogue and adjusts the control flow 10471 // edges from the first pass. 10472 EPI.MainLoopVF = EPI.EpilogueVF; 10473 EPI.MainLoopUF = EPI.EpilogueUF; 10474 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10475 ORE, EPI, &LVL, &CM, BFI, PSI, 10476 Checks); 10477 10478 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10479 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10480 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10481 Header->setName("vec.epilog.vector.body"); 10482 10483 // Re-use the trip count and steps expanded for the main loop, as 10484 // skeleton creation needs it as a value that dominates both the scalar 10485 // and vector epilogue loops 10486 // TODO: This is a workaround needed for epilogue vectorization and it 10487 // should be removed once induction resume value creation is done 10488 // directly in VPlan. 10489 EpilogILV.setTripCount(MainILV.getTripCount()); 10490 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { 10491 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); 10492 auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( 10493 ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10494 ExpandR->replaceAllUsesWith(ExpandedVal); 10495 ExpandR->eraseFromParent(); 10496 } 10497 10498 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10499 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10500 // before vectorizing the epilogue loop. 10501 for (VPRecipeBase &R : Header->phis()) { 10502 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10503 continue; 10504 10505 Value *ResumeV = nullptr; 10506 // TODO: Move setting of resume values to prepareToExecute. 10507 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10508 ResumeV = MainILV.getReductionResumeValue( 10509 ReductionPhi->getRecurrenceDescriptor()); 10510 } else { 10511 // Create induction resume values for both widened pointer and 10512 // integer/fp inductions and update the start value of the induction 10513 // recipes to use the resume value. 
10514 PHINode *IndPhi = nullptr; 10515 const InductionDescriptor *ID; 10516 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10517 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10518 ID = &Ind->getInductionDescriptor(); 10519 } else { 10520 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10521 IndPhi = WidenInd->getPHINode(); 10522 ID = &WidenInd->getInductionDescriptor(); 10523 } 10524 10525 ResumeV = MainILV.createInductionResumeValue( 10526 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), 10527 {EPI.MainLoopIterationCountCheck}); 10528 } 10529 assert(ResumeV && "Must have a resume value"); 10530 VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV); 10531 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10532 } 10533 10534 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10535 DT, true, &ExpandedSCEVs); 10536 ++LoopsEpilogueVectorized; 10537 10538 if (!MainILV.areSafetyChecksAdded()) 10539 DisableRuntimeUnroll = true; 10540 } else { 10541 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10542 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10543 PSI, Checks); 10544 10545 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10546 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10547 ++LoopsVectorized; 10548 10549 // Add metadata to disable runtime unrolling a scalar loop when there 10550 // are no runtime checks about strides and memory. A scalar loop that is 10551 // rarely used is not worth unrolling. 10552 if (!LB.areSafetyChecksAdded()) 10553 DisableRuntimeUnroll = true; 10554 } 10555 // Report the vectorization decision. 10556 ORE->emit([&]() { 10557 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10558 L->getHeader()) 10559 << "vectorized loop (vectorization width: " 10560 << NV("VectorizationFactor", VF.Width) 10561 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10562 }); 10563 } 10564 10565 if (ORE->allowExtraAnalysis(LV_NAME)) 10566 checkMixedPrecision(L, ORE); 10567 } 10568 10569 std::optional<MDNode *> RemainderLoopID = 10570 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10571 LLVMLoopVectorizeFollowupEpilogue}); 10572 if (RemainderLoopID) { 10573 L->setLoopID(*RemainderLoopID); 10574 } else { 10575 if (DisableRuntimeUnroll) 10576 AddRuntimeUnrollDisableMetaData(L); 10577 10578 // Mark the loop as already vectorized to avoid vectorizing again. 10579 Hints.setAlreadyVectorized(); 10580 } 10581 10582 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10583 return true; 10584 } 10585 10586 LoopVectorizeResult LoopVectorizePass::runImpl( 10587 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10588 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, 10589 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, 10590 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10591 SE = &SE_; 10592 LI = &LI_; 10593 TTI = &TTI_; 10594 DT = &DT_; 10595 BFI = BFI_; 10596 TLI = TLI_; 10597 AC = &AC_; 10598 LAIs = &LAIs_; 10599 DB = &DB_; 10600 ORE = &ORE_; 10601 PSI = PSI_; 10602 10603 // Don't attempt if 10604 // 1. the target claims to have no vector registers, and 10605 // 2. interleaving won't help ILP. 10606 // 10607 // The second condition is necessary because, even if the target has no 10608 // vector registers, loop vectorization may still enable scalar 10609 // interleaving. 
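  // (For example, a target reporting no vector registers but a maximum
  //  interleave factor of 2 or more can still profit from interleaving a
  //  scalar loop.)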
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed)
      LAIs->clear();
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other expensive
  // analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BlockFrequencyInfo *BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<ScalarEvolutionAnalysis>();

#ifdef EXPENSIVE_CHECKS
    SE.verify();
#endif
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}
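// For reference, a sketch of the printed pipeline fragment with the default
// options (the leading pass name depends on the registered mapping and is an
// assumption here):
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>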