//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
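
// Illustrative sketch (not taken from this pass): with a vectorization factor
// of 4 and no predication, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1.0f;
//
// is conceptually rewritten so that each 'wide' iteration covers four
// consecutive elements and the induction variable advances by the VF:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)       // vector body: one wide iteration
//     for (int j = 0; j < 4; ++j)    //   becomes a single SIMD load/add/store
//       a[i + j] = b[i + j] + 1.0f;
//   for (; i < n; ++i)               // scalar epilogue for the remainder
//     a[i] = b[i] + 1.0f;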

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// The option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists all
// options. I.e., the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
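
// Illustrative usage (not part of the pass): the VPlan-native path is opt-in
// and can be exercised on a module with something like
//
//   opt -passes=loop-vectorize -enable-vplan-native-path -S input.ll
//
// assuming a recent opt built from this tree.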

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}
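
// Illustrative example (an assumption for exposition, not taken from a test):
// for a loop whose trip count is unknown to SCEV but whose latch branch
// carries profile weights such as
//
//   br i1 %exitcond, label %exit, label %loop.body, !prof !0
//   !0 = !{!"branch_weights", i32 1, i32 299}
//
// step 1 above fails, and step 2 returns the profile-based estimate (roughly
// 300 iterations) via getLoopEstimatedTripCount, provided
// -loop-vectorize-with-block-frequency (the default) is enabled.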

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. In cases where the loop skeleton is more complicated (e.g.
  /// epilogue vectorization) and the resume values can come from an additional
  /// bypass block, the \p AdditionalBypass pair provides information about the
  /// bypass block and the end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
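
// Illustrative sketch (assumed values, not from a test): for a fixed VF = 4
// and Step = 2, createStepForVF simply returns the constant 8; for a scalable
// VF = <vscale x 4> it returns 8 * vscale, materialized via
// IRBuilder::CreateVScale, so the emitted IR scales with the runtime vector
// length. getRuntimeVF is the Step == 1 case of the same computation.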

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();

  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the compare.
  // The only way that we get a backedge taken count is that the induction
  // variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  return SE.getAddExpr(BackedgeTakenCount,
                       SE.getOne(BackedgeTakenCount->getType()));
}
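
// For example (illustrative): for a canonical loop `for (i = 0; i < n; ++i)`
// the backedge-taken count is n - 1, and the trip count returned here is
// (n - 1) + 1 == n, possibly truncated or zero-extended to the requested
// index type first.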

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe
  // or VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
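
// Illustrative ordering (follows directly from the comparator above): all
// fixed VFs sort before all scalable VFs, each group by ascending minimum
// element count, e.g.
//   1 < 2 < 4 < ... < vscale x 1 < vscale x 2 < vscale x 4 < ...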

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };
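
  // Illustrative mapping (an informal assumption; the real decision is cost
  // and legality driven): a unit-stride access like A[i] typically becomes
  // CM_Widen, a reversed access A[n - i] becomes CM_Widen_Reverse, members of
  // a strided group such as A[2*i]/A[2*i+1] may become CM_Interleave, and an
  // indexed access A[B[i]] ends up as CM_GatherScatter or CM_Scalarize
  // depending on target support.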
1336 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337 InstructionCost Cost) {
1338 assert(VF.isVector() && "Expected VF >=2");
1339 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340 }
1341
1342 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343 /// interleaving group \p Grp and vector width \p VF.
1344 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345 ElementCount VF, InstWidening W,
1346 InstructionCost Cost) {
1347 assert(VF.isVector() && "Expected VF >=2");
1348 // Broadcast this decision to all instructions inside the group.
1349 // But the cost will be assigned to one instruction only.
1350 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351 if (auto *I = Grp->getMember(i)) {
1352 if (Grp->getInsertPos() == I)
1353 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354 else
1355 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356 }
1357 }
1358 }
1359
1360 /// Return the cost model decision for the given instruction \p I and vector
1361 /// width \p VF. Return CM_Unknown if this instruction did not pass
1362 /// through the cost modeling.
1363 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364 assert(VF.isVector() && "Expected VF to be a vector VF");
1365 // Cost model is not run in the VPlan-native path - return conservative
1366 // result until this changes.
1367 if (EnableVPlanNativePath)
1368 return CM_GatherScatter;
1369
1370 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371 auto Itr = WideningDecisions.find(InstOnVF);
1372 if (Itr == WideningDecisions.end())
1373 return CM_Unknown;
1374 return Itr->second.first;
1375 }
1376
1377 /// Return the vectorization cost for the given instruction \p I and vector
1378 /// width \p VF.
1379 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380 assert(VF.isVector() && "Expected VF >=2");
1381 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1383 "The cost is not calculated");
1384 return WideningDecisions[InstOnVF].second;
1385 }
1386
1387 /// Return true if instruction \p I is an optimizable truncate whose operand
1388 /// is an induction variable. Such a truncate will be removed by adding a new
1389 /// induction variable with the destination type.
1390 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391 // If the instruction is not a truncate, return false.
1392 auto *Trunc = dyn_cast<TruncInst>(I);
1393 if (!Trunc)
1394 return false;
1395
1396 // Get the source and destination types of the truncate.
1397 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399
1400 // If the truncate is free for the given types, return false. Replacing a
1401 // free truncate with an induction variable would add an induction variable
1402 // update instruction to each iteration of the loop. We exclude from this
1403 // check the primary induction variable since it will need an update
1404 // instruction regardless.
1405 Value *Op = Trunc->getOperand(0);
1406 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407 return false;
1408
1409 // If the truncated value is not an induction variable, return false.
1410 return Legal->isInductionPhi(Op); 1411 } 1412 1413 /// Collects the instructions to scalarize for each predicated instruction in 1414 /// the loop. 1415 void collectInstsToScalarize(ElementCount VF); 1416 1417 /// Collect Uniform and Scalar values for the given \p VF. 1418 /// The sets depend on CM decision for Load/Store instructions 1419 /// that may be vectorized as interleave, gather-scatter or scalarized. 1420 void collectUniformsAndScalars(ElementCount VF) { 1421 // Do the analysis once. 1422 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1423 return; 1424 setCostBasedWideningDecision(VF); 1425 collectLoopUniforms(VF); 1426 collectLoopScalars(VF); 1427 } 1428 1429 /// Returns true if the target machine supports masked store operation 1430 /// for the given \p DataType and kind of access to \p Ptr. 1431 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1432 return Legal->isConsecutivePtr(DataType, Ptr) && 1433 TTI.isLegalMaskedStore(DataType, Alignment); 1434 } 1435 1436 /// Returns true if the target machine supports masked load operation 1437 /// for the given \p DataType and kind of access to \p Ptr. 1438 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1439 return Legal->isConsecutivePtr(DataType, Ptr) && 1440 TTI.isLegalMaskedLoad(DataType, Alignment); 1441 } 1442 1443 /// Returns true if the target machine can represent \p V as a masked gather 1444 /// or scatter operation. 1445 bool isLegalGatherOrScatter(Value *V, 1446 ElementCount VF = ElementCount::getFixed(1)) { 1447 bool LI = isa<LoadInst>(V); 1448 bool SI = isa<StoreInst>(V); 1449 if (!LI && !SI) 1450 return false; 1451 auto *Ty = getLoadStoreType(V); 1452 Align Align = getLoadStoreAlignment(V); 1453 if (VF.isVector()) 1454 Ty = VectorType::get(Ty, VF); 1455 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1456 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1457 } 1458 1459 /// Returns true if the target machine supports all of the reduction 1460 /// variables found for the given VF. 1461 bool canVectorizeReductions(ElementCount VF) const { 1462 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1463 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1464 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1465 })); 1466 } 1467 1468 /// Given costs for both strategies, return true if the scalar predication 1469 /// lowering should be used for div/rem. This incorporates an override 1470 /// option so it is not simply a cost comparison. 1471 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1472 InstructionCost SafeDivisorCost) const { 1473 switch (ForceSafeDivisor) { 1474 case cl::BOU_UNSET: 1475 return ScalarCost < SafeDivisorCost; 1476 case cl::BOU_TRUE: 1477 return false; 1478 case cl::BOU_FALSE: 1479 return true; 1480 }; 1481 llvm_unreachable("impossible case value"); 1482 } 1483 1484 /// Returns true if \p I is an instruction which requires predication and 1485 /// for which our chosen predication strategy is scalarization (i.e. we 1486 /// don't have an alternate strategy such as masking available). 1487 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1488 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1489 1490 /// Returns true if \p I is an instruction that needs to be predicated 1491 /// at runtime. The result is independent of the predication mechanism. 1492 /// Superset of instructions that return true for isScalarWithPredication. 
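/// As an illustrative example (not an exhaustive definition), in a loop body
/// containing
///   if (c[i]) { a[i] = b[i] / d[i]; }
/// both the conditional store to a[i] and the division require predication:
/// the store must not execute for masked-off lanes, and the division cannot be
/// speculated because d[i] may be zero in those lanes. Whether each of them is
/// then masked or scalarized is a separate, cost-based decision.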
1493 bool isPredicatedInst(Instruction *I) const;
1494
1495 /// Return the costs for our two available strategies for lowering a
1496 /// div/rem operation which requires speculating at least one lane.
1497 /// The first result is for scalarization (it will be invalid for scalable
1498 /// vectors); the second is for the safe-divisor strategy.
1499 std::pair<InstructionCost, InstructionCost>
1500 getDivRemSpeculationCost(Instruction *I,
1501 ElementCount VF) const;
1502
1503 /// Returns true if \p I is a memory instruction with consecutive memory
1504 /// access that can be widened.
1505 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506
1507 /// Returns true if \p I is a memory instruction in an interleaved-group
1508 /// of memory accesses that can be vectorized with wide vector loads/stores
1509 /// and shuffles.
1510 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511
1512 /// Check if \p Instr belongs to any interleaved access group.
1513 bool isAccessInterleaved(Instruction *Instr) {
1514 return InterleaveInfo.isInterleaved(Instr);
1515 }
1516
1517 /// Get the interleaved access group that \p Instr belongs to.
1518 const InterleaveGroup<Instruction> *
1519 getInterleavedAccessGroup(Instruction *Instr) {
1520 return InterleaveInfo.getInterleaveGroup(Instr);
1521 }
1522
1523 /// Returns true if we're required to use a scalar epilogue for at least
1524 /// the final iteration of the original loop.
1525 bool requiresScalarEpilogue(ElementCount VF) const {
1526 if (!isScalarEpilogueAllowed())
1527 return false;
1528 // If we might exit from anywhere but the latch, we must run the exiting
1529 // iteration in scalar form.
1530 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531 return true;
1532 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533 }
1534
1535 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536 /// loop hint annotation.
1537 bool isScalarEpilogueAllowed() const {
1538 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539 }
1540
1541 /// Returns true if all loop blocks should be masked to fold the loop tail.
1542 bool foldTailByMasking() const { return FoldTailByMasking; }
1543
1544 /// Returns true if we're tail-folding and want to use the active lane mask
1545 /// for vector loop control flow.
1546 bool useActiveLaneMaskForControlFlow() const {
1547 return FoldTailByMasking &&
1548 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549 }
1550
1551 /// Returns true if the instructions in this block require predication
1552 /// for any reason, e.g. because tail folding now requires a predicate
1553 /// or because the block in the original loop was predicated.
1554 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556 }
1557
1558 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559 /// nodes to the chain of instructions representing the reductions. Uses a
1560 /// MapVector to ensure deterministic iteration order.
1561 using ReductionChainMap =
1562 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563
1564 /// Return the chain of instructions representing an inloop reduction.
1565 const ReductionChainMap &getInLoopReductionChains() const {
1566 return InLoopReductionChains;
1567 }
1568
1569 /// Returns true if the Phi is part of an inloop reduction.
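/// For example (a sketch, not tied to a particular test), for
///   for (i = 0; i < n; ++i) sum += a[i];
/// the phi for 'sum' is an in-loop reduction if collectInLoopReductions()
/// decided to form the partial sums inside the loop (e.g. with a
/// llvm.vector.reduce.add per vector iteration) instead of keeping a wide
/// vector accumulator that is only reduced after the loop.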
1570 bool isInLoopReduction(PHINode *Phi) const { 1571 return InLoopReductionChains.count(Phi); 1572 } 1573 1574 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1575 /// with factor VF. Return the cost of the instruction, including 1576 /// scalarization overhead if it's needed. 1577 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1578 1579 /// Estimate cost of a call instruction CI if it were vectorized with factor 1580 /// VF. Return the cost of the instruction, including scalarization overhead 1581 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1582 /// scalarized - 1583 /// i.e. either vector version isn't available, or is too expensive. 1584 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1585 bool &NeedToScalarize) const; 1586 1587 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1588 /// that of B. 1589 bool isMoreProfitable(const VectorizationFactor &A, 1590 const VectorizationFactor &B) const; 1591 1592 /// Invalidates decisions already taken by the cost model. 1593 void invalidateCostModelingDecisions() { 1594 WideningDecisions.clear(); 1595 Uniforms.clear(); 1596 Scalars.clear(); 1597 } 1598 1599 /// Convenience function that returns the value of vscale_range iff 1600 /// vscale_range.min == vscale_range.max or otherwise returns the value 1601 /// returned by the corresponding TLI method. 1602 std::optional<unsigned> getVScaleForTuning() const; 1603 1604 private: 1605 unsigned NumPredStores = 0; 1606 1607 /// \return An upper bound for the vectorization factors for both 1608 /// fixed and scalable vectorization, where the minimum-known number of 1609 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1610 /// disabled or unsupported, then the scalable part will be equal to 1611 /// ElementCount::getScalable(0). 1612 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1613 ElementCount UserVF, 1614 bool FoldTailByMasking); 1615 1616 /// \return the maximized element count based on the targets vector 1617 /// registers and the loop trip-count, but limited to a maximum safe VF. 1618 /// This is a helper function of computeFeasibleMaxVF. 1619 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1620 unsigned SmallestType, 1621 unsigned WidestType, 1622 ElementCount MaxSafeVF, 1623 bool FoldTailByMasking); 1624 1625 /// \return the maximum legal scalable VF, based on the safe max number 1626 /// of elements. 1627 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1628 1629 /// The vectorization cost is a combination of the cost itself and a boolean 1630 /// indicating whether any of the contributing operations will actually 1631 /// operate on vector values after type legalization in the backend. If this 1632 /// latter value is false, then all operations will be scalarized (i.e. no 1633 /// vectorization has actually taken place). 1634 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1635 1636 /// Returns the expected execution cost. The unit of the cost does 1637 /// not matter because we use the 'cost' units to compare different 1638 /// vector widths. The cost that is returned is *not* normalized by 1639 /// the factor width. If \p Invalid is not nullptr, this function 1640 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1641 /// each instruction that has an Invalid cost for the given VF. 
1642 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1643 VectorizationCostTy 1644 expectedCost(ElementCount VF, 1645 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1646 1647 /// Returns the execution time cost of an instruction for a given vector 1648 /// width. Vector width of one means scalar. 1649 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1650 1651 /// The cost-computation logic from getInstructionCost which provides 1652 /// the vector type as an output parameter. 1653 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1654 Type *&VectorTy); 1655 1656 /// Return the cost of instructions in an inloop reduction pattern, if I is 1657 /// part of that pattern. 1658 std::optional<InstructionCost> 1659 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1660 TTI::TargetCostKind CostKind); 1661 1662 /// Calculate vectorization cost of memory instruction \p I. 1663 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1664 1665 /// The cost computation for scalarized memory instruction. 1666 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1667 1668 /// The cost computation for interleaving group of memory instructions. 1669 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1670 1671 /// The cost computation for Gather/Scatter instruction. 1672 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1673 1674 /// The cost computation for widening instruction \p I with consecutive 1675 /// memory access. 1676 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1677 1678 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1679 /// Load: scalar load + broadcast. 1680 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1681 /// element) 1682 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1683 1684 /// Estimate the overhead of scalarizing an instruction. This is a 1685 /// convenience wrapper for the type-based getScalarizationOverhead API. 1686 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, 1687 TTI::TargetCostKind CostKind) const; 1688 1689 /// Returns true if an artificially high cost for emulated masked memrefs 1690 /// should be used. 1691 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1692 1693 /// Map of scalar integer values to the smallest bitwidth they can be legally 1694 /// represented as. The vector equivalents of these values should be truncated 1695 /// to this type. 1696 MapVector<Instruction *, uint64_t> MinBWs; 1697 1698 /// A type representing the costs for instructions if they were to be 1699 /// scalarized rather than vectorized. The entries are Instruction-Cost 1700 /// pairs. 1701 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1702 1703 /// A set containing all BasicBlocks that are known to present after 1704 /// vectorization as a predicated block. 1705 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1706 PredicatedBBsAfterVectorization; 1707 1708 /// Records whether it is allowed to have the original scalar loop execute at 1709 /// least once. This may be needed as a fallback loop in case runtime 1710 /// aliasing/dependence checks fail, or to handle the tail/remainder 1711 /// iterations when the trip count is unknown or doesn't divide by the VF, 1712 /// or as a peel-loop to handle gaps in interleave-groups. 
1713 /// Under optsize and when the trip count is very small we don't allow any 1714 /// iterations to execute in the scalar loop. 1715 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1716 1717 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1718 bool FoldTailByMasking = false; 1719 1720 /// A map holding scalar costs for different vectorization factors. The 1721 /// presence of a cost for an instruction in the mapping indicates that the 1722 /// instruction will be scalarized when vectorizing with the associated 1723 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1724 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1725 1726 /// Holds the instructions known to be uniform after vectorization. 1727 /// The data is collected per VF. 1728 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1729 1730 /// Holds the instructions known to be scalar after vectorization. 1731 /// The data is collected per VF. 1732 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1733 1734 /// Holds the instructions (address computations) that are forced to be 1735 /// scalarized. 1736 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1737 1738 /// PHINodes of the reductions that should be expanded in-loop along with 1739 /// their associated chains of reduction operations, in program order from top 1740 /// (PHI) to bottom 1741 ReductionChainMap InLoopReductionChains; 1742 1743 /// A Map of inloop reduction operations and their immediate chain operand. 1744 /// FIXME: This can be removed once reductions can be costed correctly in 1745 /// vplan. This was added to allow quick lookup to the inloop operations, 1746 /// without having to loop through InLoopReductionChains. 1747 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1748 1749 /// Returns the expected difference in cost from scalarizing the expression 1750 /// feeding a predicated instruction \p PredInst. The instructions to 1751 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1752 /// non-negative return value implies the expression will be scalarized. 1753 /// Currently, only single-use chains are considered for scalarization. 1754 InstructionCost computePredInstDiscount(Instruction *PredInst, 1755 ScalarCostsTy &ScalarCosts, 1756 ElementCount VF); 1757 1758 /// Collect the instructions that are uniform after vectorization. An 1759 /// instruction is uniform if we represent it with a single scalar value in 1760 /// the vectorized loop corresponding to each vector iteration. Examples of 1761 /// uniform instructions include pointer operands of consecutive or 1762 /// interleaved memory accesses. Note that although uniformity implies an 1763 /// instruction will be scalar, the reverse is not true. In general, a 1764 /// scalarized instruction will be represented by VF scalar values in the 1765 /// vectorized loop, each corresponding to an iteration of the original 1766 /// scalar loop. 1767 void collectLoopUniforms(ElementCount VF); 1768 1769 /// Collect the instructions that are scalar after vectorization. An 1770 /// instruction is scalar if it is known to be uniform or will be scalarized 1771 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1772 /// to the list if they are used by a load/store instruction that is marked as 1773 /// CM_Scalarize. 
Non-uniform scalarized instructions will be represented by 1774 /// VF values in the vectorized loop, each corresponding to an iteration of 1775 /// the original scalar loop. 1776 void collectLoopScalars(ElementCount VF); 1777 1778 /// Keeps cost model vectorization decision and cost for instructions. 1779 /// Right now it is used for memory instructions only. 1780 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1781 std::pair<InstWidening, InstructionCost>>; 1782 1783 DecisionList WideningDecisions; 1784 1785 /// Returns true if \p V is expected to be vectorized and it needs to be 1786 /// extracted. 1787 bool needsExtract(Value *V, ElementCount VF) const { 1788 Instruction *I = dyn_cast<Instruction>(V); 1789 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1790 TheLoop->isLoopInvariant(I)) 1791 return false; 1792 1793 // Assume we can vectorize V (and hence we need extraction) if the 1794 // scalars are not computed yet. This can happen, because it is called 1795 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1796 // the scalars are collected. That should be a safe assumption in most 1797 // cases, because we check if the operands have vectorizable types 1798 // beforehand in LoopVectorizationLegality. 1799 return Scalars.find(VF) == Scalars.end() || 1800 !isScalarAfterVectorization(I, VF); 1801 }; 1802 1803 /// Returns a range containing only operands needing to be extracted. 1804 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1805 ElementCount VF) const { 1806 return SmallVector<Value *, 4>(make_filter_range( 1807 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1808 } 1809 1810 /// Determines if we have the infrastructure to vectorize loop \p L and its 1811 /// epilogue, assuming the main loop is vectorized by \p VF. 1812 bool isCandidateForEpilogueVectorization(const Loop &L, 1813 const ElementCount VF) const; 1814 1815 /// Returns true if epilogue vectorization is considered profitable, and 1816 /// false otherwise. 1817 /// \p VF is the vectorization factor chosen for the original loop. 1818 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1819 1820 public: 1821 /// The loop that we evaluate. 1822 Loop *TheLoop; 1823 1824 /// Predicated scalar evolution analysis. 1825 PredicatedScalarEvolution &PSE; 1826 1827 /// Loop Info analysis. 1828 LoopInfo *LI; 1829 1830 /// Vectorization legality. 1831 LoopVectorizationLegality *Legal; 1832 1833 /// Vector target information. 1834 const TargetTransformInfo &TTI; 1835 1836 /// Target Library Info. 1837 const TargetLibraryInfo *TLI; 1838 1839 /// Demanded bits analysis. 1840 DemandedBits *DB; 1841 1842 /// Assumption cache. 1843 AssumptionCache *AC; 1844 1845 /// Interface to emit optimization remarks. 1846 OptimizationRemarkEmitter *ORE; 1847 1848 const Function *TheFunction; 1849 1850 /// Loop Vectorize Hint. 1851 const LoopVectorizeHints *Hints; 1852 1853 /// The interleave access information contains groups of interleaved accesses 1854 /// with the same stride and close to each other. 1855 InterleavedAccessInfo &InterleaveInfo; 1856 1857 /// Values to ignore in the cost model. 1858 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1859 1860 /// Values to ignore in the cost model when VF > 1. 1861 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1862 1863 /// All element types found in the loop. 1864 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1865 1866 /// Profitable vector factors. 
1867 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868 };
1869 } // end namespace llvm
1870
1871 namespace {
1872 /// Helper struct to manage generating runtime checks for vectorization.
1873 ///
1874 /// The runtime checks are created up-front in temporary blocks, un-linked from
1875 /// the existing IR, to allow a more accurate cost estimate. After deciding to
1876 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1877 /// temporary blocks are completely removed.
1878 class GeneratedRTChecks {
1879 /// Basic block which contains the generated SCEV checks, if any.
1880 BasicBlock *SCEVCheckBlock = nullptr;
1881
1882 /// The value representing the result of the generated SCEV checks. If it is
1883 /// nullptr, either no SCEV checks have been generated or they have been used.
1884 Value *SCEVCheckCond = nullptr;
1885
1886 /// Basic block which contains the generated memory runtime checks, if any.
1887 BasicBlock *MemCheckBlock = nullptr;
1888
1889 /// The value representing the result of the generated memory runtime checks.
1890 /// If it is nullptr, either no memory runtime checks have been generated or
1891 /// they have been used.
1892 Value *MemRuntimeCheckCond = nullptr;
1893
1894 DominatorTree *DT;
1895 LoopInfo *LI;
1896 TargetTransformInfo *TTI;
1897
1898 SCEVExpander SCEVExp;
1899 SCEVExpander MemCheckExp;
1900
1901 bool CostTooHigh = false;
1902
1903 public:
1904 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905 TargetTransformInfo *TTI, const DataLayout &DL)
1906 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907 MemCheckExp(SE, DL, "scev.check") {}
1908
1909 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910 /// accurately estimate the cost of the runtime checks. The blocks are
1911 /// un-linked from the IR and are added back during vector code generation. If
1912 /// there is no vector code generation, the check blocks are removed
1913 /// completely.
1914 void Create(Loop *L, const LoopAccessInfo &LAI,
1915 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916
1917 // Hard cutoff to limit compile-time increase in case a very large number of
1918 // runtime checks need to be generated.
1919 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920 // profile info.
1921 CostTooHigh =
1922 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923 if (CostTooHigh)
1924 return;
1925
1926 BasicBlock *LoopHeader = L->getHeader();
1927 BasicBlock *Preheader = L->getLoopPreheader();
1928
1929 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931 // may be used by SCEVExpander. The blocks will be un-linked from their
1932 // predecessors and removed from LI & DT at the end of the function.
1933 if (!UnionPred.isAlwaysTrue()) {
1934 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935 nullptr, "vector.scevcheck");
1936
1937 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938 &UnionPred, SCEVCheckBlock->getTerminator());
1939 }
1940
1941 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942 if (RtPtrChecking.Need) {
1943 auto *Pred = SCEVCheckBlock ?
SCEVCheckBlock : Preheader; 1944 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1945 "vector.memcheck"); 1946 1947 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1948 if (DiffChecks) { 1949 Value *RuntimeVF = nullptr; 1950 MemRuntimeCheckCond = addDiffRuntimeChecks( 1951 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1952 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1953 if (!RuntimeVF) 1954 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1955 return RuntimeVF; 1956 }, 1957 IC); 1958 } else { 1959 MemRuntimeCheckCond = 1960 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1961 RtPtrChecking.getChecks(), MemCheckExp); 1962 } 1963 assert(MemRuntimeCheckCond && 1964 "no RT checks generated although RtPtrChecking " 1965 "claimed checks are required"); 1966 } 1967 1968 if (!MemCheckBlock && !SCEVCheckBlock) 1969 return; 1970 1971 // Unhook the temporary block with the checks, update various places 1972 // accordingly. 1973 if (SCEVCheckBlock) 1974 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1975 if (MemCheckBlock) 1976 MemCheckBlock->replaceAllUsesWith(Preheader); 1977 1978 if (SCEVCheckBlock) { 1979 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1980 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1981 Preheader->getTerminator()->eraseFromParent(); 1982 } 1983 if (MemCheckBlock) { 1984 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1985 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1986 Preheader->getTerminator()->eraseFromParent(); 1987 } 1988 1989 DT->changeImmediateDominator(LoopHeader, Preheader); 1990 if (MemCheckBlock) { 1991 DT->eraseNode(MemCheckBlock); 1992 LI->removeBlock(MemCheckBlock); 1993 } 1994 if (SCEVCheckBlock) { 1995 DT->eraseNode(SCEVCheckBlock); 1996 LI->removeBlock(SCEVCheckBlock); 1997 } 1998 } 1999 2000 InstructionCost getCost() { 2001 if (SCEVCheckBlock || MemCheckBlock) 2002 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2003 2004 if (CostTooHigh) { 2005 InstructionCost Cost; 2006 Cost.setInvalid(); 2007 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2008 return Cost; 2009 } 2010 2011 InstructionCost RTCheckCost = 0; 2012 if (SCEVCheckBlock) 2013 for (Instruction &I : *SCEVCheckBlock) { 2014 if (SCEVCheckBlock->getTerminator() == &I) 2015 continue; 2016 InstructionCost C = 2017 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2018 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2019 RTCheckCost += C; 2020 } 2021 if (MemCheckBlock) 2022 for (Instruction &I : *MemCheckBlock) { 2023 if (MemCheckBlock->getTerminator() == &I) 2024 continue; 2025 InstructionCost C = 2026 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2027 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2028 RTCheckCost += C; 2029 } 2030 2031 if (SCEVCheckBlock || MemCheckBlock) 2032 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2033 << "\n"); 2034 2035 return RTCheckCost; 2036 } 2037 2038 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2039 /// unused. 
2040 ~GeneratedRTChecks() { 2041 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2042 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2043 if (!SCEVCheckCond) 2044 SCEVCleaner.markResultUsed(); 2045 2046 if (!MemRuntimeCheckCond) 2047 MemCheckCleaner.markResultUsed(); 2048 2049 if (MemRuntimeCheckCond) { 2050 auto &SE = *MemCheckExp.getSE(); 2051 // Memory runtime check generation creates compares that use expanded 2052 // values. Remove them before running the SCEVExpanderCleaners. 2053 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2054 if (MemCheckExp.isInsertedInstruction(&I)) 2055 continue; 2056 SE.forgetValue(&I); 2057 I.eraseFromParent(); 2058 } 2059 } 2060 MemCheckCleaner.cleanup(); 2061 SCEVCleaner.cleanup(); 2062 2063 if (SCEVCheckCond) 2064 SCEVCheckBlock->eraseFromParent(); 2065 if (MemRuntimeCheckCond) 2066 MemCheckBlock->eraseFromParent(); 2067 } 2068 2069 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2070 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2071 /// depending on the generated condition. 2072 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2073 BasicBlock *LoopVectorPreHeader, 2074 BasicBlock *LoopExitBlock) { 2075 if (!SCEVCheckCond) 2076 return nullptr; 2077 2078 Value *Cond = SCEVCheckCond; 2079 // Mark the check as used, to prevent it from being removed during cleanup. 2080 SCEVCheckCond = nullptr; 2081 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2082 if (C->isZero()) 2083 return nullptr; 2084 2085 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2086 2087 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2088 // Create new preheader for vector loop. 2089 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2090 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2091 2092 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2093 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2094 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2095 SCEVCheckBlock); 2096 2097 DT->addNewBlock(SCEVCheckBlock, Pred); 2098 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2099 2100 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2101 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2102 return SCEVCheckBlock; 2103 } 2104 2105 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2106 /// the branches to branch to the vector preheader or \p Bypass, depending on 2107 /// the generated condition. 2108 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2109 BasicBlock *LoopVectorPreHeader) { 2110 // Check if we generated code that checks in runtime if arrays overlap. 2111 if (!MemRuntimeCheckCond) 2112 return nullptr; 2113 2114 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2115 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2116 MemCheckBlock); 2117 2118 DT->addNewBlock(MemCheckBlock, Pred); 2119 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2120 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2121 2122 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2123 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2124 2125 ReplaceInstWithInst( 2126 MemCheckBlock->getTerminator(), 2127 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2128 MemCheckBlock->getTerminator()->setDebugLoc( 2129 Pred->getTerminator()->getDebugLoc()); 2130 2131 // Mark the check as used, to prevent it from being removed during cleanup. 
2132 MemRuntimeCheckCond = nullptr; 2133 return MemCheckBlock; 2134 } 2135 }; 2136 } // namespace 2137 2138 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2139 // vectorization. The loop needs to be annotated with #pragma omp simd 2140 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2141 // vector length information is not provided, vectorization is not considered 2142 // explicit. Interleave hints are not allowed either. These limitations will be 2143 // relaxed in the future. 2144 // Please, note that we are currently forced to abuse the pragma 'clang 2145 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2146 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2147 // provides *explicit vectorization hints* (LV can bypass legal checks and 2148 // assume that vectorization is legal). However, both hints are implemented 2149 // using the same metadata (llvm.loop.vectorize, processed by 2150 // LoopVectorizeHints). This will be fixed in the future when the native IR 2151 // representation for pragma 'omp simd' is introduced. 2152 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2153 OptimizationRemarkEmitter *ORE) { 2154 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2155 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2156 2157 // Only outer loops with an explicit vectorization hint are supported. 2158 // Unannotated outer loops are ignored. 2159 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2160 return false; 2161 2162 Function *Fn = OuterLp->getHeader()->getParent(); 2163 if (!Hints.allowVectorization(Fn, OuterLp, 2164 true /*VectorizeOnlyWhenForced*/)) { 2165 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2166 return false; 2167 } 2168 2169 if (Hints.getInterleave() > 1) { 2170 // TODO: Interleave support is future work. 2171 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2172 "outer loops.\n"); 2173 Hints.emitRemarkWithHints(); 2174 return false; 2175 } 2176 2177 return true; 2178 } 2179 2180 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2181 OptimizationRemarkEmitter *ORE, 2182 SmallVectorImpl<Loop *> &V) { 2183 // Collect inner loops and outer loops without irreducible control flow. For 2184 // now, only collect outer loops that have explicit vectorization hints. If we 2185 // are stress testing the VPlan H-CFG construction, we collect the outermost 2186 // loop of every loop nest. 2187 if (L.isInnermost() || VPlanBuildStressTest || 2188 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2189 LoopBlocksRPO RPOT(&L); 2190 RPOT.perform(LI); 2191 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2192 V.push_back(&L); 2193 // TODO: Collect inner loops inside marked outer loops in case 2194 // vectorization fails for the outer loop. Do not invoke 2195 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2196 // already known to be reducible. We can use an inherited attribute for 2197 // that. 2198 return; 2199 } 2200 } 2201 for (Loop *InnerL : L) 2202 collectSupportedLoops(*InnerL, LI, ORE, V); 2203 } 2204 2205 namespace { 2206 2207 /// The LoopVectorize Pass. 
2208 struct LoopVectorize : public FunctionPass { 2209 /// Pass identification, replacement for typeid 2210 static char ID; 2211 2212 LoopVectorizePass Impl; 2213 2214 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2215 bool VectorizeOnlyWhenForced = false) 2216 : FunctionPass(ID), 2217 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2218 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2219 } 2220 2221 bool runOnFunction(Function &F) override { 2222 if (skipFunction(F)) 2223 return false; 2224 2225 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2226 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2227 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2228 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2229 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2230 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2231 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2232 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2233 auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs(); 2234 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2235 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2236 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2237 2238 return Impl 2239 .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI) 2240 .MadeAnyChange; 2241 } 2242 2243 void getAnalysisUsage(AnalysisUsage &AU) const override { 2244 AU.addRequired<AssumptionCacheTracker>(); 2245 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2246 AU.addRequired<DominatorTreeWrapperPass>(); 2247 AU.addRequired<LoopInfoWrapperPass>(); 2248 AU.addRequired<ScalarEvolutionWrapperPass>(); 2249 AU.addRequired<TargetTransformInfoWrapperPass>(); 2250 AU.addRequired<LoopAccessLegacyAnalysis>(); 2251 AU.addRequired<DemandedBitsWrapperPass>(); 2252 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2253 AU.addRequired<InjectTLIMappingsLegacy>(); 2254 2255 // We currently do not preserve loopinfo/dominator analyses with outer loop 2256 // vectorization. Until this is addressed, mark these analyses as preserved 2257 // only for non-VPlan-native path. 2258 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2259 if (!EnableVPlanNativePath) { 2260 AU.addPreserved<LoopInfoWrapperPass>(); 2261 AU.addPreserved<DominatorTreeWrapperPass>(); 2262 } 2263 2264 AU.addPreserved<BasicAAWrapperPass>(); 2265 AU.addPreserved<GlobalsAAWrapperPass>(); 2266 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2267 } 2268 }; 2269 2270 } // end anonymous namespace 2271 2272 //===----------------------------------------------------------------------===// 2273 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2274 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2275 //===----------------------------------------------------------------------===// 2276 2277 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2278 // We need to place the broadcast of invariant variables outside the loop, 2279 // but only if it's proven safe to do so. Else, broadcast will be inside 2280 // vector loop body. 
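// As a sketch of what CreateVectorSplat emits below for a fixed VF of 4 and an
// i32 value %x (scalable VFs use the equivalent scalable splat idiom):
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i64 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer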
2281 Instruction *Instr = dyn_cast<Instruction>(V); 2282 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2283 (!Instr || 2284 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2285 // Place the code for broadcasting invariant variables in the new preheader. 2286 IRBuilder<>::InsertPointGuard Guard(Builder); 2287 if (SafeToHoist) 2288 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2289 2290 // Broadcast the scalar into all locations in the vector. 2291 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2292 2293 return Shuf; 2294 } 2295 2296 /// This function adds 2297 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2298 /// to each vector element of Val. The sequence starts at StartIndex. 2299 /// \p Opcode is relevant for FP induction variable. 2300 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2301 Instruction::BinaryOps BinOp, ElementCount VF, 2302 IRBuilderBase &Builder) { 2303 assert(VF.isVector() && "only vector VFs are supported"); 2304 2305 // Create and check the types. 2306 auto *ValVTy = cast<VectorType>(Val->getType()); 2307 ElementCount VLen = ValVTy->getElementCount(); 2308 2309 Type *STy = Val->getType()->getScalarType(); 2310 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2311 "Induction Step must be an integer or FP"); 2312 assert(Step->getType() == STy && "Step has wrong type"); 2313 2314 SmallVector<Constant *, 8> Indices; 2315 2316 // Create a vector of consecutive numbers from zero to VF. 2317 VectorType *InitVecValVTy = ValVTy; 2318 if (STy->isFloatingPointTy()) { 2319 Type *InitVecValSTy = 2320 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2321 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2322 } 2323 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2324 2325 // Splat the StartIdx 2326 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2327 2328 if (STy->isIntegerTy()) { 2329 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2330 Step = Builder.CreateVectorSplat(VLen, Step); 2331 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2332 // FIXME: The newly created binary instructions should contain nsw/nuw 2333 // flags, which can be found from the original scalar operations. 2334 Step = Builder.CreateMul(InitVec, Step); 2335 return Builder.CreateAdd(Val, Step, "induction"); 2336 } 2337 2338 // Floating point induction. 2339 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2340 "Binary Opcode should be specified for FP induction"); 2341 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2342 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2343 2344 Step = Builder.CreateVectorSplat(VLen, Step); 2345 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2346 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2347 } 2348 2349 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2350 /// variable on which to base the steps, \p Step is the size of the step. 2351 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2352 const InductionDescriptor &ID, VPValue *Def, 2353 VPTransformState &State) { 2354 IRBuilderBase &Builder = State.Builder; 2355 2356 // Ensure step has the same type as that of scalar IV. 2357 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2358 if (ScalarIVTy != Step->getType()) { 2359 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to 2360 // avoid separate truncate here. 
2361 assert(Step->getType()->isIntegerTy() && 2362 "Truncation requires an integer step"); 2363 Step = State.Builder.CreateTrunc(Step, ScalarIVTy); 2364 } 2365 2366 // We build scalar steps for both integer and floating-point induction 2367 // variables. Here, we determine the kind of arithmetic we will perform. 2368 Instruction::BinaryOps AddOp; 2369 Instruction::BinaryOps MulOp; 2370 if (ScalarIVTy->isIntegerTy()) { 2371 AddOp = Instruction::Add; 2372 MulOp = Instruction::Mul; 2373 } else { 2374 AddOp = ID.getInductionOpcode(); 2375 MulOp = Instruction::FMul; 2376 } 2377 2378 // Determine the number of scalars we need to generate for each unroll 2379 // iteration. 2380 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2381 // Compute the scalar steps and save the results in State. 2382 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2383 ScalarIVTy->getScalarSizeInBits()); 2384 Type *VecIVTy = nullptr; 2385 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2386 if (!FirstLaneOnly && State.VF.isScalable()) { 2387 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2388 UnitStepVec = 2389 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2390 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2391 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2392 } 2393 2394 unsigned StartPart = 0; 2395 unsigned EndPart = State.UF; 2396 unsigned StartLane = 0; 2397 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2398 if (State.Instance) { 2399 StartPart = State.Instance->Part; 2400 EndPart = StartPart + 1; 2401 StartLane = State.Instance->Lane.getKnownLane(); 2402 EndLane = StartLane + 1; 2403 } 2404 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 2405 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2406 2407 if (!FirstLaneOnly && State.VF.isScalable()) { 2408 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2409 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2410 if (ScalarIVTy->isFloatingPointTy()) 2411 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2412 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2413 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2414 State.set(Def, Add, Part); 2415 // It's useful to record the lane values too for the known minimum number 2416 // of elements so we do those below. This improves the code quality when 2417 // trying to extract the first element, for example. 2418 } 2419 2420 if (ScalarIVTy->isFloatingPointTy()) 2421 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2422 2423 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 2424 Value *StartIdx = Builder.CreateBinOp( 2425 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2426 // The step returned by `createStepForVF` is a runtime-evaluated value 2427 // when VF is scalable. Otherwise, it should be folded into a Constant. 2428 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2429 "Expected StartIdx to be folded to a constant when VF is not " 2430 "scalable"); 2431 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2432 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2433 State.set(Def, Add, VPIteration(Part, Lane)); 2434 } 2435 } 2436 } 2437 2438 // Generate code for the induction step. 
Note that induction steps are 2439 // required to be loop-invariant 2440 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2441 Instruction *InsertBefore, 2442 Loop *OrigLoop = nullptr) { 2443 const DataLayout &DL = SE.getDataLayout(); 2444 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2445 "Induction step should be loop invariant"); 2446 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2447 return E->getValue(); 2448 2449 SCEVExpander Exp(SE, DL, "induction"); 2450 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2451 } 2452 2453 /// Compute the transformed value of Index at offset StartValue using step 2454 /// StepValue. 2455 /// For integer induction, returns StartValue + Index * StepValue. 2456 /// For pointer induction, returns StartValue[Index * StepValue]. 2457 /// FIXME: The newly created binary instructions should contain nsw/nuw 2458 /// flags, which can be found from the original scalar operations. 2459 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2460 Value *StartValue, Value *Step, 2461 const InductionDescriptor &ID) { 2462 Type *StepTy = Step->getType(); 2463 Value *CastedIndex = StepTy->isIntegerTy() 2464 ? B.CreateSExtOrTrunc(Index, StepTy) 2465 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2466 if (CastedIndex != Index) { 2467 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2468 Index = CastedIndex; 2469 } 2470 2471 // Note: the IR at this point is broken. We cannot use SE to create any new 2472 // SCEV and then expand it, hoping that SCEV's simplification will give us 2473 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2474 // lead to various SCEV crashes. So all we can do is to use builder and rely 2475 // on InstCombine for future simplifications. Here we handle some trivial 2476 // cases only. 2477 auto CreateAdd = [&B](Value *X, Value *Y) { 2478 assert(X->getType() == Y->getType() && "Types don't match!"); 2479 if (auto *CX = dyn_cast<ConstantInt>(X)) 2480 if (CX->isZero()) 2481 return Y; 2482 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2483 if (CY->isZero()) 2484 return X; 2485 return B.CreateAdd(X, Y); 2486 }; 2487 2488 // We allow X to be a vector type, in which case Y will potentially be 2489 // splatted into a vector with the same element count. 
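// For example (illustrative), CreateAdd(%start, 0) simply returns %start and
// CreateMul(%idx, 1) below simply returns %idx, so no redundant IR is emitted
// while the function is operating on this intermediate, not-yet-consistent IR.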
2490 auto CreateMul = [&B](Value *X, Value *Y) { 2491 assert(X->getType()->getScalarType() == Y->getType() && 2492 "Types don't match!"); 2493 if (auto *CX = dyn_cast<ConstantInt>(X)) 2494 if (CX->isOne()) 2495 return Y; 2496 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2497 if (CY->isOne()) 2498 return X; 2499 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2500 if (XVTy && !isa<VectorType>(Y->getType())) 2501 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2502 return B.CreateMul(X, Y); 2503 }; 2504 2505 switch (ID.getKind()) { 2506 case InductionDescriptor::IK_IntInduction: { 2507 assert(!isa<VectorType>(Index->getType()) && 2508 "Vector indices not supported for integer inductions yet"); 2509 assert(Index->getType() == StartValue->getType() && 2510 "Index type does not match StartValue type"); 2511 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2512 return B.CreateSub(StartValue, Index); 2513 auto *Offset = CreateMul(Index, Step); 2514 return CreateAdd(StartValue, Offset); 2515 } 2516 case InductionDescriptor::IK_PtrInduction: { 2517 assert(isa<Constant>(Step) && 2518 "Expected constant step for pointer induction"); 2519 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2520 } 2521 case InductionDescriptor::IK_FpInduction: { 2522 assert(!isa<VectorType>(Index->getType()) && 2523 "Vector indices not supported for FP inductions yet"); 2524 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2525 auto InductionBinOp = ID.getInductionBinOp(); 2526 assert(InductionBinOp && 2527 (InductionBinOp->getOpcode() == Instruction::FAdd || 2528 InductionBinOp->getOpcode() == Instruction::FSub) && 2529 "Original bin op should be defined for FP induction"); 2530 2531 Value *MulExp = B.CreateFMul(Step, Index); 2532 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2533 "induction"); 2534 } 2535 case InductionDescriptor::IK_NoInduction: 2536 return nullptr; 2537 } 2538 llvm_unreachable("invalid enum"); 2539 } 2540 2541 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2542 const VPIteration &Instance, 2543 VPTransformState &State) { 2544 Value *ScalarInst = State.get(Def, Instance); 2545 Value *VectorValue = State.get(Def, Instance.Part); 2546 VectorValue = Builder.CreateInsertElement( 2547 VectorValue, ScalarInst, 2548 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2549 State.set(Def, VectorValue, Instance.Part); 2550 } 2551 2552 // Return whether we allow using masked interleave-groups (for dealing with 2553 // strided loads/stores that reside in predicated blocks, or for dealing 2554 // with gaps). 2555 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2556 // If an override option has been passed in for interleaved accesses, use it. 2557 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2558 return EnableMaskedInterleavedMemAccesses; 2559 2560 return TTI.enableMaskedInterleavedAccessVectorization(); 2561 } 2562 2563 // Try to vectorize the interleave group that \p Instr belongs to. 2564 // 2565 // E.g. Translate following interleaved load group (factor = 3): 2566 // for (i = 0; i < N; i+=3) { 2567 // R = Pic[i]; // Member of index 0 2568 // G = Pic[i+1]; // Member of index 1 2569 // B = Pic[i+2]; // Member of index 2 2570 // ... 
// do something to R, G, B
2571 // }
2572 // To:
2573 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2574 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2575 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2576 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2577 //
2578 // Or translate following interleaved store group (factor = 3):
2579 // for (i = 0; i < N; i+=3) {
2580 // ... do something to R, G, B
2581 // Pic[i] = R; // Member of index 0
2582 // Pic[i+1] = G; // Member of index 1
2583 // Pic[i+2] = B; // Member of index 2
2584 // }
2585 // To:
2586 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2587 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2588 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2589 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2590 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2591 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2592 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2593 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2594 VPValue *BlockInMask) {
2595 Instruction *Instr = Group->getInsertPos();
2596 const DataLayout &DL = Instr->getModule()->getDataLayout();
2597
2598 // Prepare for the vector type of the interleaved load/store.
2599 Type *ScalarTy = getLoadStoreType(Instr);
2600 unsigned InterleaveFactor = Group->getFactor();
2601 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2602 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2603
2604 // Prepare for the new pointers.
2605 SmallVector<Value *, 2> AddrParts;
2606 unsigned Index = Group->getIndex(Instr);
2607
2608 // TODO: extend the masked interleaved-group support to reversed access.
2609 assert((!BlockInMask || !Group->isReverse()) &&
2610 "Reversed masked interleave-group not supported.");
2611
2612 // If the group is reversed, adjust the index to refer to the last vector lane
2613 // instead of the first. We adjust the index from the first vector lane,
2614 // rather than directly getting the pointer for lane VF - 1, because the
2615 // pointer operand of the interleaved access is supposed to be uniform. For
2616 // uniform instructions, we're only required to generate a value for the
2617 // first vector lane in each unroll iteration.
2618 if (Group->isReverse())
2619 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2620
2621 for (unsigned Part = 0; Part < UF; Part++) {
2622 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2623 State.setDebugLocFromInst(AddrPart);
2624
2625 // Note that the current instruction could be a member at any index. We need
2626 // to adjust the address to that of the member at index 0.
2627 //
2628 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2629 // b = A[i]; // Member of index 0
2630 // The current pointer points to A[i+1]; adjust it to A[i].
2631 //
2632 // E.g. A[i+1] = a; // Member of index 1
2633 // A[i] = b; // Member of index 0
2634 // A[i+2] = c; // Member of index 2 (Current instruction)
2635 // The current pointer points to A[i+2]; adjust it to A[i].
2636
2637 bool InBounds = false;
2638 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2639 InBounds = gep->isInBounds();
2640 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2641 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2642
2643 // Cast to the vector pointer type.
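// E.g. for a factor-3 group of i32 accesses at VF=4 this is a cast from the
// scalar pointer type to <12 x i32>* in the same address space (illustrative).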
2644 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2645 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2646 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2647 } 2648 2649 State.setDebugLocFromInst(Instr); 2650 Value *PoisonVec = PoisonValue::get(VecTy); 2651 2652 Value *MaskForGaps = nullptr; 2653 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2654 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2655 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2656 } 2657 2658 // Vectorize the interleaved load group. 2659 if (isa<LoadInst>(Instr)) { 2660 // For each unroll part, create a wide load for the group. 2661 SmallVector<Value *, 2> NewLoads; 2662 for (unsigned Part = 0; Part < UF; Part++) { 2663 Instruction *NewLoad; 2664 if (BlockInMask || MaskForGaps) { 2665 assert(useMaskedInterleavedAccesses(*TTI) && 2666 "masked interleaved groups are not allowed."); 2667 Value *GroupMask = MaskForGaps; 2668 if (BlockInMask) { 2669 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2670 Value *ShuffledMask = Builder.CreateShuffleVector( 2671 BlockInMaskPart, 2672 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2673 "interleaved.mask"); 2674 GroupMask = MaskForGaps 2675 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2676 MaskForGaps) 2677 : ShuffledMask; 2678 } 2679 NewLoad = 2680 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2681 GroupMask, PoisonVec, "wide.masked.vec"); 2682 } 2683 else 2684 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2685 Group->getAlign(), "wide.vec"); 2686 Group->addMetadata(NewLoad); 2687 NewLoads.push_back(NewLoad); 2688 } 2689 2690 // For each member in the group, shuffle out the appropriate data from the 2691 // wide loads. 2692 unsigned J = 0; 2693 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2694 Instruction *Member = Group->getMember(I); 2695 2696 // Skip the gaps in the group. 2697 if (!Member) 2698 continue; 2699 2700 auto StrideMask = 2701 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2702 for (unsigned Part = 0; Part < UF; Part++) { 2703 Value *StridedVec = Builder.CreateShuffleVector( 2704 NewLoads[Part], StrideMask, "strided.vec"); 2705 2706 // If this member has different type, cast the result type. 2707 if (Member->getType() != ScalarTy) { 2708 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2709 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2710 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2711 } 2712 2713 if (Group->isReverse()) 2714 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2715 2716 State.set(VPDefs[J], StridedVec, Part); 2717 } 2718 ++J; 2719 } 2720 return; 2721 } 2722 2723 // The sub vector type for current instruction. 2724 auto *SubVT = VectorType::get(ScalarTy, VF); 2725 2726 // Vectorize the interleaved store group. 2727 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2728 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2729 "masked interleaved groups are not allowed."); 2730 assert((!MaskForGaps || !VF.isScalable()) && 2731 "masking gaps for scalable vectors is not yet supported."); 2732 for (unsigned Part = 0; Part < UF; Part++) { 2733 // Collect the stored vector from each member. 
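// For example (illustrative, VF = 4, factor = 3): three <4 x i32> member vectors are gathered below (poison stands in for a gap), concatenated into one <12 x i32> value, interleaved, and written with a single wide store.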
2734 SmallVector<Value *, 4> StoredVecs; 2735 unsigned StoredIdx = 0; 2736 for (unsigned i = 0; i < InterleaveFactor; i++) { 2737 assert((Group->getMember(i) || MaskForGaps) && 2738 "Fail to get a member from an interleaved store group"); 2739 Instruction *Member = Group->getMember(i); 2740 2741 // Skip the gaps in the group. 2742 if (!Member) { 2743 Value *Undef = PoisonValue::get(SubVT); 2744 StoredVecs.push_back(Undef); 2745 continue; 2746 } 2747 2748 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2749 ++StoredIdx; 2750 2751 if (Group->isReverse()) 2752 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2753 2754 // If this member has different type, cast it to a unified type. 2755 2756 if (StoredVec->getType() != SubVT) 2757 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2758 2759 StoredVecs.push_back(StoredVec); 2760 } 2761 2762 // Concatenate all vectors into a wide vector. 2763 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2764 2765 // Interleave the elements in the wide vector. 2766 Value *IVec = Builder.CreateShuffleVector( 2767 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2768 "interleaved.vec"); 2769 2770 Instruction *NewStoreInstr; 2771 if (BlockInMask || MaskForGaps) { 2772 Value *GroupMask = MaskForGaps; 2773 if (BlockInMask) { 2774 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2775 Value *ShuffledMask = Builder.CreateShuffleVector( 2776 BlockInMaskPart, 2777 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2778 "interleaved.mask"); 2779 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2780 ShuffledMask, MaskForGaps) 2781 : ShuffledMask; 2782 } 2783 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2784 Group->getAlign(), GroupMask); 2785 } else 2786 NewStoreInstr = 2787 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2788 2789 Group->addMetadata(NewStoreInstr); 2790 } 2791 } 2792 2793 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2794 VPReplicateRecipe *RepRecipe, 2795 const VPIteration &Instance, 2796 bool IfPredicateInstr, 2797 VPTransformState &State) { 2798 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2799 2800 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2801 // the first lane and part. 2802 if (isa<NoAliasScopeDeclInst>(Instr)) 2803 if (!Instance.isFirstIteration()) 2804 return; 2805 2806 // Does this instruction return a value ? 2807 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2808 2809 Instruction *Cloned = Instr->clone(); 2810 if (!IsVoidRetTy) 2811 Cloned->setName(Instr->getName() + ".cloned"); 2812 2813 // If the scalarized instruction contributes to the address computation of a 2814 // widen masked load/store which was in a basic block that needed predication 2815 // and is not predicated after vectorization, we can't propagate 2816 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2817 // instruction could feed a poison value to the base address of the widen 2818 // load/store. 2819 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2820 Cloned->dropPoisonGeneratingFlags(); 2821 2822 if (Instr->getDebugLoc()) 2823 State.setDebugLocFromInst(Instr); 2824 2825 // Replace the operands of the cloned instructions with their scalar 2826 // equivalents in the new loop. 
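// Note: operands that are uniform-after-vectorization only provide a value for lane 0, so the lookup below falls back to the first lane for them.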
2827 for (const auto &I : enumerate(RepRecipe->operands())) { 2828 auto InputInstance = Instance; 2829 VPValue *Operand = I.value(); 2830 if (vputils::isUniformAfterVectorization(Operand)) 2831 InputInstance.Lane = VPLane::getFirstLane(); 2832 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2833 } 2834 State.addNewMetadata(Cloned, Instr); 2835 2836 // Place the cloned scalar in the new loop. 2837 State.Builder.Insert(Cloned); 2838 2839 State.set(RepRecipe, Cloned, Instance); 2840 2841 // If we just cloned a new assumption, add it the assumption cache. 2842 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2843 AC->registerAssumption(II); 2844 2845 // End if-block. 2846 if (IfPredicateInstr) 2847 PredicatedInstructions.push_back(Cloned); 2848 } 2849 2850 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2851 if (TripCount) 2852 return TripCount; 2853 2854 assert(InsertBlock); 2855 IRBuilder<> Builder(InsertBlock->getTerminator()); 2856 // Find the loop boundaries. 2857 Type *IdxTy = Legal->getWidestInductionType(); 2858 assert(IdxTy && "No type for induction"); 2859 const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); 2860 2861 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2862 2863 // Expand the trip count and place the new instructions in the preheader. 2864 // Notice that the pre-header does not change, only the loop body. 2865 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2866 2867 // Count holds the overall loop count (N). 2868 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2869 InsertBlock->getTerminator()); 2870 2871 if (TripCount->getType()->isPointerTy()) 2872 TripCount = 2873 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2874 InsertBlock->getTerminator()); 2875 2876 return TripCount; 2877 } 2878 2879 Value * 2880 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2881 if (VectorTripCount) 2882 return VectorTripCount; 2883 2884 Value *TC = getOrCreateTripCount(InsertBlock); 2885 IRBuilder<> Builder(InsertBlock->getTerminator()); 2886 2887 Type *Ty = TC->getType(); 2888 // This is where we can make the step a runtime constant. 2889 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2890 2891 // If the tail is to be folded by masking, round the number of iterations N 2892 // up to a multiple of Step instead of rounding down. This is done by first 2893 // adding Step-1 and then rounding down. Note that it's ok if this addition 2894 // overflows: the vector induction variable will eventually wrap to zero given 2895 // that it starts at zero and its Step is a power of two; the loop will then 2896 // exit, with the last early-exit vector comparison also producing all-true. 2897 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2898 // is accounted for in emitIterationCountCheck that adds an overflow check. 2899 if (Cost->foldTailByMasking()) { 2900 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2901 "VF*UF must be a power of 2 when folding tail by masking"); 2902 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2903 TC = Builder.CreateAdd( 2904 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2905 } 2906 2907 // Now we need to generate the expression for the part of the loop that the 2908 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2909 // iterations are not required for correctness, or N - Step, otherwise. 
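// (Illustrative numbers: with N = 20 and Step = 8 the vector loop covers 16 iterations and 4 remain for the scalar loop; if a scalar epilogue is required and Step divides N evenly, a full Step of iterations is left for the remainder instead.)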
Step 2910 // is equal to the vectorization factor (number of SIMD elements) times the 2911 // unroll factor (number of SIMD instructions). 2912 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2913 2914 // There are cases where we *must* run at least one iteration in the remainder 2915 // loop. See the cost model for when this can happen. If the step evenly 2916 // divides the trip count, we set the remainder to be equal to the step. If 2917 // the step does not evenly divide the trip count, no adjustment is necessary 2918 // since there will already be scalar iterations. Note that the minimum 2919 // iterations check ensures that N >= Step. 2920 if (Cost->requiresScalarEpilogue(VF)) { 2921 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2922 R = Builder.CreateSelect(IsZero, Step, R); 2923 } 2924 2925 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2926 2927 return VectorTripCount; 2928 } 2929 2930 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2931 const DataLayout &DL) { 2932 // Verify that V is a vector type with same number of elements as DstVTy. 2933 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2934 unsigned VF = DstFVTy->getNumElements(); 2935 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2936 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2937 Type *SrcElemTy = SrcVecTy->getElementType(); 2938 Type *DstElemTy = DstFVTy->getElementType(); 2939 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2940 "Vector elements must have same size"); 2941 2942 // Do a direct cast if element types are castable. 2943 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2944 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2945 } 2946 // V cannot be directly casted to desired vector type. 2947 // May happen when V is a floating point vector but DstVTy is a vector of 2948 // pointers or vice-versa. Handle this using a two-step bitcast using an 2949 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2950 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2951 "Only one type should be a pointer type"); 2952 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2953 "Only one type should be a floating point type"); 2954 Type *IntTy = 2955 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2956 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2957 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2958 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2959 } 2960 2961 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2962 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2963 // Reuse existing vector loop preheader for TC checks. 2964 // Note that new preheader block is generated for vector loop. 2965 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2966 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2967 2968 // Generate code to check if the loop's trip count is less than VF * UF, or 2969 // equal to it in case a scalar epilogue is required; this implies that the 2970 // vector trip count is zero. This check also covers the case where adding one 2971 // to the backedge-taken count overflowed leading to an incorrect trip count 2972 // of zero. In this case we will also jump to the scalar loop. 2973 auto P = Cost->requiresScalarEpilogue(VF) ? 
ICmpInst::ICMP_ULE 2974 : ICmpInst::ICMP_ULT; 2975 2976 // If tail is to be folded, vector loop takes care of all iterations. 2977 Type *CountTy = Count->getType(); 2978 Value *CheckMinIters = Builder.getFalse(); 2979 auto CreateStep = [&]() -> Value * { 2980 // Create step with max(MinProTripCount, UF * VF). 2981 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2982 return createStepForVF(Builder, CountTy, VF, UF); 2983 2984 Value *MinProfTC = 2985 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2986 if (!VF.isScalable()) 2987 return MinProfTC; 2988 return Builder.CreateBinaryIntrinsic( 2989 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2990 }; 2991 2992 if (!Cost->foldTailByMasking()) 2993 CheckMinIters = 2994 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2995 else if (VF.isScalable()) { 2996 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2997 // an overflow to zero when updating induction variables and so an 2998 // additional overflow check is required before entering the vector loop. 2999 3000 // Get the maximum unsigned value for the type. 3001 Value *MaxUIntTripCount = 3002 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 3003 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 3004 3005 // Don't execute the vector loop if (UMax - n) < (VF * UF). 3006 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 3007 } 3008 3009 // Create new preheader for vector loop. 3010 LoopVectorPreHeader = 3011 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3012 "vector.ph"); 3013 3014 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3015 DT->getNode(Bypass)->getIDom()) && 3016 "TC check is expected to dominate Bypass"); 3017 3018 // Update dominator for Bypass & LoopExit (if needed). 3019 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3020 if (!Cost->requiresScalarEpilogue(VF)) 3021 // If there is an epilogue which must run, there's no edge from the 3022 // middle block to exit blocks and thus no need to update the immediate 3023 // dominator of the exit blocks. 3024 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3025 3026 ReplaceInstWithInst( 3027 TCCheckBlock->getTerminator(), 3028 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3029 LoopBypassBlocks.push_back(TCCheckBlock); 3030 } 3031 3032 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3033 BasicBlock *const SCEVCheckBlock = 3034 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3035 if (!SCEVCheckBlock) 3036 return nullptr; 3037 3038 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3039 (OptForSizeBasedOnProfile && 3040 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3041 "Cannot SCEV check stride or overflow when optimizing for size"); 3042 3043 3044 // Update dominator only if this is first RT check. 3045 if (LoopBypassBlocks.empty()) { 3046 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3047 if (!Cost->requiresScalarEpilogue(VF)) 3048 // If there is an epilogue which must run, there's no edge from the 3049 // middle block to exit blocks and thus no need to update the immediate 3050 // dominator of the exit blocks. 
3051 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3052 } 3053 3054 LoopBypassBlocks.push_back(SCEVCheckBlock); 3055 AddedSafetyChecks = true; 3056 return SCEVCheckBlock; 3057 } 3058 3059 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3060 // VPlan-native path does not do any analysis for runtime checks currently. 3061 if (EnableVPlanNativePath) 3062 return nullptr; 3063 3064 BasicBlock *const MemCheckBlock = 3065 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3066 3067 // Check if we generated code that checks in runtime if arrays overlap. We put 3068 // the checks into a separate block to make the more common case of few 3069 // elements faster. 3070 if (!MemCheckBlock) 3071 return nullptr; 3072 3073 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3074 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3075 "Cannot emit memory checks when optimizing for size, unless forced " 3076 "to vectorize."); 3077 ORE->emit([&]() { 3078 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3079 OrigLoop->getStartLoc(), 3080 OrigLoop->getHeader()) 3081 << "Code-size may be reduced by not forcing " 3082 "vectorization, or by source-code modifications " 3083 "eliminating the need for runtime checks " 3084 "(e.g., adding 'restrict')."; 3085 }); 3086 } 3087 3088 LoopBypassBlocks.push_back(MemCheckBlock); 3089 3090 AddedSafetyChecks = true; 3091 3092 return MemCheckBlock; 3093 } 3094 3095 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3096 LoopScalarBody = OrigLoop->getHeader(); 3097 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3098 assert(LoopVectorPreHeader && "Invalid loop structure"); 3099 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3100 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3101 "multiple exit loop without required epilogue?"); 3102 3103 LoopMiddleBlock = 3104 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3105 LI, nullptr, Twine(Prefix) + "middle.block"); 3106 LoopScalarPreHeader = 3107 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3108 nullptr, Twine(Prefix) + "scalar.ph"); 3109 3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3111 3112 // Set up the middle block terminator. Two cases: 3113 // 1) If we know that we must execute the scalar epilogue, emit an 3114 // unconditional branch. 3115 // 2) Otherwise, we must have a single unique exit block (due to how we 3116 // implement the multiple exit case). In this case, set up a conditional 3117 // branch from the middle block to the loop scalar preheader, and the 3118 // exit block. completeLoopSkeleton will update the condition to use an 3119 // iteration check, if required to decide whether to execute the remainder. 3120 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 3121 BranchInst::Create(LoopScalarPreHeader) : 3122 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3123 Builder.getTrue()); 3124 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3125 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3126 3127 // Update dominator for loop exit. During skeleton creation, only the vector 3128 // pre-header and the middle block are created. The vector loop is entirely 3129 // created during VPlan exection. 
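// That is, control currently flows from the (still empty) vector pre-header through the middle block to the scalar pre-header; the blocks of the vector loop proper are inserted between the pre-header and the middle block when the VPlan executes.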
3130 if (!Cost->requiresScalarEpilogue(VF)) 3131 // If there is an epilogue which must run, there's no edge from the 3132 // middle block to exit blocks and thus no need to update the immediate 3133 // dominator of the exit blocks. 3134 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3135 } 3136 3137 PHINode *InnerLoopVectorizer::createInductionResumeValue( 3138 PHINode *OrigPhi, const InductionDescriptor &II, 3139 ArrayRef<BasicBlock *> BypassBlocks, 3140 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3141 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3142 assert(VectorTripCount && "Expected valid arguments"); 3143 3144 Instruction *OldInduction = Legal->getPrimaryInduction(); 3145 Value *&EndValue = IVEndValues[OrigPhi]; 3146 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3147 if (OrigPhi == OldInduction) { 3148 // We know what the end value is. 3149 EndValue = VectorTripCount; 3150 } else { 3151 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3152 3153 // Fast-math-flags propagate from the original induction instruction. 3154 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3155 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3156 3157 Value *Step = 3158 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3159 EndValue = 3160 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); 3161 EndValue->setName("ind.end"); 3162 3163 // Compute the end value for the additional bypass (if applicable). 3164 if (AdditionalBypass.first) { 3165 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3166 Value *Step = 3167 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3168 EndValueFromAdditionalBypass = emitTransformedIndex( 3169 B, AdditionalBypass.second, II.getStartValue(), Step, II); 3170 EndValueFromAdditionalBypass->setName("ind.end"); 3171 } 3172 } 3173 3174 // Create phi nodes to merge from the backedge-taken check block. 3175 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3176 LoopScalarPreHeader->getTerminator()); 3177 // Copy original phi DL over to the new one. 3178 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3179 3180 // The new PHI merges the original incoming value, in case of a bypass, 3181 // or the value at the end of the vectorized loop. 3182 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3183 3184 // Fix the scalar body counter (PHI node). 3185 // The old induction's phi node in the scalar body needs the truncated 3186 // value. 3187 for (BasicBlock *BB : BypassBlocks) 3188 BCResumeVal->addIncoming(II.getStartValue(), BB); 3189 3190 if (AdditionalBypass.first) 3191 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3192 EndValueFromAdditionalBypass); 3193 return BCResumeVal; 3194 } 3195 3196 void InnerLoopVectorizer::createInductionResumeValues( 3197 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3198 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3199 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3200 "Inconsistent information about additional bypass."); 3201 // We are going to resume the execution of the scalar loop. 3202 // Go over all of the induction variables that we found and fix the 3203 // PHIs that are left in the scalar version of the loop. 3204 // The starting values of PHI nodes depend on the counter of the last 3205 // iteration in the vectorized loop. 
3206 // If we come from a bypass edge then we need to start from the original 3207 // start value. 3208 for (const auto &InductionEntry : Legal->getInductionVars()) { 3209 PHINode *OrigPhi = InductionEntry.first; 3210 const InductionDescriptor &II = InductionEntry.second; 3211 PHINode *BCResumeVal = createInductionResumeValue( 3212 OrigPhi, II, LoopBypassBlocks, AdditionalBypass); 3213 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3214 } 3215 } 3216 3217 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3218 // The trip counts should be cached by now. 3219 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3220 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3221 3222 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3223 3224 // Add a check in the middle block to see if we have completed 3225 // all of the iterations in the first vector loop. Three cases: 3226 // 1) If we require a scalar epilogue, there is no conditional branch as 3227 // we unconditionally branch to the scalar preheader. Do nothing. 3228 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3229 // Thus if tail is to be folded, we know we don't need to run the 3230 // remainder and we can use the previous value for the condition (true). 3231 // 3) Otherwise, construct a runtime check. 3232 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3233 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3234 Count, VectorTripCount, "cmp.n", 3235 LoopMiddleBlock->getTerminator()); 3236 3237 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3238 // of the corresponding compare because they may have ended up with 3239 // different line numbers and we want to avoid awkward line stepping while 3240 // debugging. Eg. if the compare has got a line number inside the loop. 3241 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3242 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3243 } 3244 3245 #ifdef EXPENSIVE_CHECKS 3246 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3247 #endif 3248 3249 return LoopVectorPreHeader; 3250 } 3251 3252 std::pair<BasicBlock *, Value *> 3253 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3254 /* 3255 In this function we generate a new loop. The new loop will contain 3256 the vectorized instructions while the old loop will continue to run the 3257 scalar remainder. 3258 3259 [ ] <-- loop iteration number check. 3260 / | 3261 / v 3262 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3263 | / | 3264 | / v 3265 || [ ] <-- vector pre header. 3266 |/ | 3267 | v 3268 | [ ] \ 3269 | [ ]_| <-- vector loop (created during VPlan execution). 3270 | | 3271 | v 3272 \ -[ ] <--- middle-block. 3273 \/ | 3274 /\ v 3275 | ->[ ] <--- new preheader. 3276 | | 3277 (opt) v <-- edge from middle to exit iff epilogue is not required. 3278 | [ ] \ 3279 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3280 \ | 3281 \ v 3282 >[ ] <-- exit block(s). 3283 ... 3284 */ 3285 3286 // Create an empty vector loop, and prepare basic blocks for the runtime 3287 // checks. 3288 createVectorLoopSkeleton(""); 3289 3290 // Now, compare the new count to zero. If it is zero skip the vector loop and 3291 // jump to the scalar loop. This check also covers the case where the 3292 // backedge-taken count is uint##_max: adding one to it will overflow leading 3293 // to an incorrect trip count of zero. 
In this (rare) case we will also jump 3294 // to the scalar loop. 3295 emitIterationCountCheck(LoopScalarPreHeader); 3296 3297 // Generate the code to check any assumptions that we've made for SCEV 3298 // expressions. 3299 emitSCEVChecks(LoopScalarPreHeader); 3300 3301 // Generate the code that checks in runtime if arrays overlap. We put the 3302 // checks into a separate block to make the more common case of few elements 3303 // faster. 3304 emitMemRuntimeChecks(LoopScalarPreHeader); 3305 3306 // Emit phis for the new starting index of the scalar loop. 3307 createInductionResumeValues(); 3308 3309 return {completeLoopSkeleton(), nullptr}; 3310 } 3311 3312 // Fix up external users of the induction variable. At this point, we are 3313 // in LCSSA form, with all external PHIs that use the IV having one input value, 3314 // coming from the remainder loop. We need those PHIs to also have a correct 3315 // value for the IV when arriving directly from the middle block. 3316 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3317 const InductionDescriptor &II, 3318 Value *VectorTripCount, Value *EndValue, 3319 BasicBlock *MiddleBlock, 3320 BasicBlock *VectorHeader, VPlan &Plan) { 3321 // There are two kinds of external IV usages - those that use the value 3322 // computed in the last iteration (the PHI) and those that use the penultimate 3323 // value (the value that feeds into the phi from the loop latch). 3324 // We allow both, but they, obviously, have different values. 3325 3326 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3327 3328 DenseMap<Value *, Value *> MissingVals; 3329 3330 // An external user of the last iteration's value should see the value that 3331 // the remainder loop uses to initialize its own IV. 3332 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3333 for (User *U : PostInc->users()) { 3334 Instruction *UI = cast<Instruction>(U); 3335 if (!OrigLoop->contains(UI)) { 3336 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3337 MissingVals[UI] = EndValue; 3338 } 3339 } 3340 3341 // An external user of the penultimate value need to see EndValue - Step. 3342 // The simplest way to get this is to recompute it from the constituent SCEVs, 3343 // that is Start + (Step * (CRD - 1)). 3344 for (User *U : OrigPhi->users()) { 3345 auto *UI = cast<Instruction>(U); 3346 if (!OrigLoop->contains(UI)) { 3347 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3348 3349 IRBuilder<> B(MiddleBlock->getTerminator()); 3350 3351 // Fast-math-flags propagate from the original induction instruction. 3352 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3353 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3354 3355 Value *CountMinusOne = B.CreateSub( 3356 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3357 CountMinusOne->setName("cmo"); 3358 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3359 VectorHeader->getTerminator()); 3360 Value *Escape = 3361 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); 3362 Escape->setName("ind.escape"); 3363 MissingVals[UI] = Escape; 3364 } 3365 } 3366 3367 for (auto &I : MissingVals) { 3368 PHINode *PHI = cast<PHINode>(I.first); 3369 // One corner case we have to handle is two IVs "chasing" each-other, 3370 // that is %IV2 = phi [...], [ %IV1, %latch ] 3371 // In this case, if IV1 has an external use, we need to avoid adding both 3372 // "last value of IV1" and "penultimate value of IV2". 
So, verify that we 3373 // don't already have an incoming value for the middle block. 3374 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3375 PHI->addIncoming(I.second, MiddleBlock); 3376 Plan.removeLiveOut(PHI); 3377 } 3378 } 3379 } 3380 3381 namespace { 3382 3383 struct CSEDenseMapInfo { 3384 static bool canHandle(const Instruction *I) { 3385 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3386 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3387 } 3388 3389 static inline Instruction *getEmptyKey() { 3390 return DenseMapInfo<Instruction *>::getEmptyKey(); 3391 } 3392 3393 static inline Instruction *getTombstoneKey() { 3394 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3395 } 3396 3397 static unsigned getHashValue(const Instruction *I) { 3398 assert(canHandle(I) && "Unknown instruction!"); 3399 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3400 I->value_op_end())); 3401 } 3402 3403 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3404 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3405 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3406 return LHS == RHS; 3407 return LHS->isIdenticalTo(RHS); 3408 } 3409 }; 3410 3411 } // end anonymous namespace 3412 3413 ///Perform cse of induction variable instructions. 3414 static void cse(BasicBlock *BB) { 3415 // Perform simple cse. 3416 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3417 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3418 if (!CSEDenseMapInfo::canHandle(&In)) 3419 continue; 3420 3421 // Check if we can replace this instruction with any of the 3422 // visited instructions. 3423 if (Instruction *V = CSEMap.lookup(&In)) { 3424 In.replaceAllUsesWith(V); 3425 In.eraseFromParent(); 3426 continue; 3427 } 3428 3429 CSEMap[&In] = &In; 3430 } 3431 } 3432 3433 InstructionCost 3434 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3435 bool &NeedToScalarize) const { 3436 Function *F = CI->getCalledFunction(); 3437 Type *ScalarRetTy = CI->getType(); 3438 SmallVector<Type *, 4> Tys, ScalarTys; 3439 for (auto &ArgOp : CI->args()) 3440 ScalarTys.push_back(ArgOp->getType()); 3441 3442 // Estimate cost of scalarized vector call. The source operands are assumed 3443 // to be vectors, so we need to extract individual elements from there, 3444 // execute VF scalar calls, and then gather the result into the vector return 3445 // value. 3446 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3447 InstructionCost ScalarCallCost = 3448 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); 3449 if (VF.isScalar()) 3450 return ScalarCallCost; 3451 3452 // Compute corresponding vector type for return value and arguments. 3453 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3454 for (Type *ScalarTy : ScalarTys) 3455 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3456 3457 // Compute costs of unpacking argument values for the scalar calls and 3458 // packing the return values to a vector. 3459 InstructionCost ScalarizationCost = 3460 getScalarizationOverhead(CI, VF, CostKind); 3461 3462 InstructionCost Cost = 3463 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3464 3465 // If we can't emit a vector call for this function, then the currently found 3466 // cost is the cost we need to return. 
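// Illustrative (hypothetical numbers): for VF = 4, a scalar call cost of 10 and a scalarization overhead of 8 give Cost = 4 * 10 + 8 = 48; if a vector library variant costs 20, it wins below and NeedToScalarize is cleared.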
3467 NeedToScalarize = true; 3468 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3469 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3470 3471 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3472 return Cost; 3473 3474 // If the corresponding vector cost is cheaper, return its cost. 3475 InstructionCost VectorCallCost = 3476 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind); 3477 if (VectorCallCost < Cost) { 3478 NeedToScalarize = false; 3479 Cost = VectorCallCost; 3480 } 3481 return Cost; 3482 } 3483 3484 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3485 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3486 return Elt; 3487 return VectorType::get(Elt, VF); 3488 } 3489 3490 InstructionCost 3491 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3492 ElementCount VF) const { 3493 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3494 assert(ID && "Expected intrinsic call!"); 3495 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3496 FastMathFlags FMF; 3497 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3498 FMF = FPMO->getFastMathFlags(); 3499 3500 SmallVector<const Value *> Arguments(CI->args()); 3501 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3502 SmallVector<Type *> ParamTys; 3503 std::transform(FTy->param_begin(), FTy->param_end(), 3504 std::back_inserter(ParamTys), 3505 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3506 3507 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3508 dyn_cast<IntrinsicInst>(CI)); 3509 return TTI.getIntrinsicInstrCost(CostAttrs, 3510 TargetTransformInfo::TCK_RecipThroughput); 3511 } 3512 3513 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3516 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3517 } 3518 3519 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3520 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3521 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3522 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3523 } 3524 3525 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3526 // For every instruction `I` in MinBWs, truncate the operands, create a 3527 // truncated version of `I` and reextend its result. InstCombine runs 3528 // later and will remove any ext/trunc pairs. 3529 SmallPtrSet<Value *, 4> Erased; 3530 for (const auto &KV : Cost->getMinimalBitwidths()) { 3531 // If the value wasn't vectorized, we must maintain the original scalar 3532 // type. The absence of the value from State indicates that it 3533 // wasn't vectorized. 3534 // FIXME: Should not rely on getVPValue at this point. 
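// Illustrative example: if MinBWs records that a 32-bit add only needs 8 bits, the loop below shrinks its operands to <VF x i8>, recreates the add at i8, and zero-extends the result back to <VF x i32>; InstCombine later removes the redundant ext/trunc pairs.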
3535 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3536 if (!State.hasAnyVectorValue(Def)) 3537 continue; 3538 for (unsigned Part = 0; Part < UF; ++Part) { 3539 Value *I = State.get(Def, Part); 3540 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3541 continue; 3542 Type *OriginalTy = I->getType(); 3543 Type *ScalarTruncatedTy = 3544 IntegerType::get(OriginalTy->getContext(), KV.second); 3545 auto *TruncatedTy = VectorType::get( 3546 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3547 if (TruncatedTy == OriginalTy) 3548 continue; 3549 3550 IRBuilder<> B(cast<Instruction>(I)); 3551 auto ShrinkOperand = [&](Value *V) -> Value * { 3552 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3553 if (ZI->getSrcTy() == TruncatedTy) 3554 return ZI->getOperand(0); 3555 return B.CreateZExtOrTrunc(V, TruncatedTy); 3556 }; 3557 3558 // The actual instruction modification depends on the instruction type, 3559 // unfortunately. 3560 Value *NewI = nullptr; 3561 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3562 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3563 ShrinkOperand(BO->getOperand(1))); 3564 3565 // Any wrapping introduced by shrinking this operation shouldn't be 3566 // considered undefined behavior. So, we can't unconditionally copy 3567 // arithmetic wrapping flags to NewI. 3568 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3569 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3570 NewI = 3571 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3572 ShrinkOperand(CI->getOperand(1))); 3573 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3574 NewI = B.CreateSelect(SI->getCondition(), 3575 ShrinkOperand(SI->getTrueValue()), 3576 ShrinkOperand(SI->getFalseValue())); 3577 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3578 switch (CI->getOpcode()) { 3579 default: 3580 llvm_unreachable("Unhandled cast!"); 3581 case Instruction::Trunc: 3582 NewI = ShrinkOperand(CI->getOperand(0)); 3583 break; 3584 case Instruction::SExt: 3585 NewI = B.CreateSExtOrTrunc( 3586 CI->getOperand(0), 3587 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3588 break; 3589 case Instruction::ZExt: 3590 NewI = B.CreateZExtOrTrunc( 3591 CI->getOperand(0), 3592 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3593 break; 3594 } 3595 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3596 auto Elements0 = 3597 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3598 auto *O0 = B.CreateZExtOrTrunc( 3599 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3600 auto Elements1 = 3601 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3602 auto *O1 = B.CreateZExtOrTrunc( 3603 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3604 3605 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3606 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3607 // Don't do anything with the operands, just extend the result. 
3608 continue; 3609 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3610 auto Elements = 3611 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3612 auto *O0 = B.CreateZExtOrTrunc( 3613 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3614 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3615 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3616 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3617 auto Elements = 3618 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3619 auto *O0 = B.CreateZExtOrTrunc( 3620 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3621 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3622 } else { 3623 // If we don't know what to do, be conservative and don't do anything. 3624 continue; 3625 } 3626 3627 // Lastly, extend the result. 3628 NewI->takeName(cast<Instruction>(I)); 3629 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3630 I->replaceAllUsesWith(Res); 3631 cast<Instruction>(I)->eraseFromParent(); 3632 Erased.insert(I); 3633 State.reset(Def, Res, Part); 3634 } 3635 } 3636 3637 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3638 for (const auto &KV : Cost->getMinimalBitwidths()) { 3639 // If the value wasn't vectorized, we must maintain the original scalar 3640 // type. The absence of the value from State indicates that it 3641 // wasn't vectorized. 3642 // FIXME: Should not rely on getVPValue at this point. 3643 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3644 if (!State.hasAnyVectorValue(Def)) 3645 continue; 3646 for (unsigned Part = 0; Part < UF; ++Part) { 3647 Value *I = State.get(Def, Part); 3648 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3649 if (Inst && Inst->use_empty()) { 3650 Value *NewI = Inst->getOperand(0); 3651 Inst->eraseFromParent(); 3652 State.reset(Def, NewI, Part); 3653 } 3654 } 3655 } 3656 } 3657 3658 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3659 VPlan &Plan) { 3660 // Insert truncates and extends for any truncated instructions as hints to 3661 // InstCombine. 3662 if (VF.isVector()) 3663 truncateToMinimalBitwidths(State); 3664 3665 // Fix widened non-induction PHIs by setting up the PHI operands. 3666 if (EnableVPlanNativePath) 3667 fixNonInductionPHIs(Plan, State); 3668 3669 // At this point every instruction in the original loop is widened to a 3670 // vector form. Now we need to fix the recurrences in the loop. These PHI 3671 // nodes are currently empty because we did not want to introduce cycles. 3672 // This is the second stage of vectorizing recurrences. 3673 fixCrossIterationPHIs(State); 3674 3675 // Forget the original basic block. 3676 PSE.getSE()->forgetLoop(OrigLoop); 3677 3678 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3679 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3680 if (Cost->requiresScalarEpilogue(VF)) { 3681 // No edge from the middle block to the unique exit block has been inserted 3682 // and there is nothing to fix from vector loop; phis should have incoming 3683 // from scalar loop only. 3684 Plan.clearLiveOuts(); 3685 } else { 3686 // If we inserted an edge from the middle block to the unique exit block, 3687 // update uses outside the loop (phis) to account for the newly inserted 3688 // edge. 3689 3690 // Fix-up external users of the induction variables. 
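// E.g. an LCSSA phi outside the loop that reads an induction variable must see the end value derived from the vector trip count when control arrives from the middle block; fixupIVUsers wires that up for each induction below.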
3691 for (const auto &Entry : Legal->getInductionVars()) 3692 fixupIVUsers(Entry.first, Entry.second, 3693 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3694 IVEndValues[Entry.first], LoopMiddleBlock, 3695 VectorLoop->getHeader(), Plan); 3696 } 3697 3698 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3699 // in the exit block, so update the builder. 3700 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3701 for (const auto &KV : Plan.getLiveOuts()) 3702 KV.second->fixPhi(Plan, State); 3703 3704 for (Instruction *PI : PredicatedInstructions) 3705 sinkScalarOperands(&*PI); 3706 3707 // Remove redundant induction instructions. 3708 cse(VectorLoop->getHeader()); 3709 3710 // Set/update profile weights for the vector and remainder loops as original 3711 // loop iterations are now distributed among them. Note that original loop 3712 // represented by LoopScalarBody becomes remainder loop after vectorization. 3713 // 3714 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3715 // end up getting slightly roughened result but that should be OK since 3716 // profile is not inherently precise anyway. Note also possible bypass of 3717 // vector code caused by legality checks is ignored, assigning all the weight 3718 // to the vector loop, optimistically. 3719 // 3720 // For scalable vectorization we can't know at compile time how many iterations 3721 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3722 // vscale of '1'. 3723 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3724 LI->getLoopFor(LoopScalarBody), 3725 VF.getKnownMinValue() * UF); 3726 } 3727 3728 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3729 // In order to support recurrences we need to be able to vectorize Phi nodes. 3730 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3731 // stage #2: We now need to fix the recurrences by adding incoming edges to 3732 // the currently empty PHI nodes. At this point every instruction in the 3733 // original loop is widened to a vector form so we can use them to construct 3734 // the incoming edges. 3735 VPBasicBlock *Header = 3736 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3737 for (VPRecipeBase &R : Header->phis()) { 3738 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3739 fixReduction(ReductionPhi, State); 3740 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3741 fixFixedOrderRecurrence(FOR, State); 3742 } 3743 } 3744 3745 void InnerLoopVectorizer::fixFixedOrderRecurrence( 3746 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3747 // This is the second phase of vectorizing first-order recurrences. An 3748 // overview of the transformation is described below. Suppose we have the 3749 // following loop. 3750 // 3751 // for (int i = 0; i < n; ++i) 3752 // b[i] = a[i] - a[i - 1]; 3753 // 3754 // There is a first-order recurrence on "a". For this loop, the shorthand 3755 // scalar IR looks like: 3756 // 3757 // scalar.ph: 3758 // s_init = a[-1] 3759 // br scalar.body 3760 // 3761 // scalar.body: 3762 // i = phi [0, scalar.ph], [i+1, scalar.body] 3763 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3764 // s2 = a[i] 3765 // b[i] = s2 - s1 3766 // br cond, scalar.body, ... 3767 // 3768 // In this example, s1 is a recurrence because it's value depends on the 3769 // previous iteration. 
In the first phase of vectorization, we created a 3770 // vector phi v1 for s1. We now complete the vectorization and produce the 3771 // shorthand vector IR shown below (for VF = 4, UF = 1). 3772 // 3773 // vector.ph: 3774 // v_init = vector(..., ..., ..., a[-1]) 3775 // br vector.body 3776 // 3777 // vector.body 3778 // i = phi [0, vector.ph], [i+4, vector.body] 3779 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3780 // v2 = a[i, i+1, i+2, i+3]; 3781 // v3 = vector(v1(3), v2(0, 1, 2)) 3782 // b[i, i+1, i+2, i+3] = v2 - v3 3783 // br cond, vector.body, middle.block 3784 // 3785 // middle.block: 3786 // x = v2(3) 3787 // br scalar.ph 3788 // 3789 // scalar.ph: 3790 // s_init = phi [x, middle.block], [a[-1], otherwise] 3791 // br scalar.body 3792 // 3793 // After execution completes the vector loop, we extract the next value of 3794 // the recurrence (x) to use as the initial value in the scalar loop. 3795 3796 // Extract the last vector element in the middle block. This will be the 3797 // initial value for the recurrence when jumping to the scalar loop. 3798 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3799 Value *Incoming = State.get(PreviousDef, UF - 1); 3800 auto *ExtractForScalar = Incoming; 3801 auto *IdxTy = Builder.getInt32Ty(); 3802 if (VF.isVector()) { 3803 auto *One = ConstantInt::get(IdxTy, 1); 3804 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3805 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3806 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3807 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 3808 "vector.recur.extract"); 3809 } 3810 // Extract the second last element in the middle block if the 3811 // Phi is used outside the loop. We need to extract the phi itself 3812 // and not the last element (the phi update in the current iteration). This 3813 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3814 // when the scalar loop is not run at all. 3815 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3816 if (VF.isVector()) { 3817 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3818 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3819 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3820 Incoming, Idx, "vector.recur.extract.for.phi"); 3821 } else if (UF > 1) 3822 // When loop is unrolled without vectorizing, initialize 3823 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 3824 // of `Incoming`. This is analogous to the vectorized case above: extracting 3825 // the second last element when VF > 1. 3826 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3827 3828 // Fix the initial value of the original recurrence in the scalar loop. 3829 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3830 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3831 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3832 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3833 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3834 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3835 Start->addIncoming(Incoming, BB); 3836 } 3837 3838 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3839 Phi->setName("scalar.recur"); 3840 3841 // Finally, fix users of the recurrence outside the loop. The users will need 3842 // either the last value of the scalar recurrence or the last value of the 3843 // vector recurrence we extracted in the middle block. 
Since the loop is in 3844 // LCSSA form, we just need to find all the phi nodes for the original scalar 3845 // recurrence in the exit block, and then add an edge for the middle block. 3846 // Note that LCSSA does not imply single entry when the original scalar loop 3847 // had multiple exiting edges (as we always run the last iteration in the 3848 // scalar epilogue); in that case, there is no edge from middle to exit and 3849 // thus no phis need updating. 3850 if (!Cost->requiresScalarEpilogue(VF)) 3851 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 3852 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { 3853 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3854 State.Plan->removeLiveOut(&LCSSAPhi); 3855 } 3856 } 3857 3858 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 3859 VPTransformState &State) { 3860 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 3861 // Get its reduction variable descriptor. 3862 assert(Legal->isReductionVariable(OrigPhi) && 3863 "Unable to find the reduction variable"); 3864 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3865 3866 RecurKind RK = RdxDesc.getRecurrenceKind(); 3867 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3868 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3869 State.setDebugLocFromInst(ReductionStartValue); 3870 3871 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3872 // This is the vector-clone of the value that leaves the loop. 3873 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3874 3875 // Wrap flags are in general invalid after vectorization; clear them. 3876 clearReductionWrapFlags(PhiR, State); 3877 3878 // Before each round, move the insertion point right between 3879 // the PHIs and the values we are going to write. 3880 // This allows us to write both PHINodes and the extractelement 3881 // instructions. 3882 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3883 3884 State.setDebugLocFromInst(LoopExitInst); 3885 3886 Type *PhiTy = OrigPhi->getType(); 3887 3888 VPBasicBlock *LatchVPBB = 3889 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3890 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3891 // If tail is folded by masking, the vector value to leave the loop should be 3892 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3893 // instead of the former. For an inloop reduction the reduction will already 3894 // be predicated, and does not need to be handled here.
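// Illustrative IR sketch (VF = 4, not taken from this code): the loop computes %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi, and it is %sel, rather than %rdx.next, that must be treated as the value leaving the loop when the tail is folded.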
3895 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3896 for (unsigned Part = 0; Part < UF; ++Part) { 3897 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3898 SelectInst *Sel = nullptr; 3899 for (User *U : VecLoopExitInst->users()) { 3900 if (isa<SelectInst>(U)) { 3901 assert(!Sel && "Reduction exit feeding two selects"); 3902 Sel = cast<SelectInst>(U); 3903 } else 3904 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3905 } 3906 assert(Sel && "Reduction exit feeds no select"); 3907 State.reset(LoopExitInstDef, Sel, Part); 3908 3909 if (isa<FPMathOperator>(Sel)) 3910 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3911 3912 // If the target can create a predicated operator for the reduction at no 3913 // extra cost in the loop (for example a predicated vadd), it can be 3914 // cheaper for the select to remain in the loop than be sunk out of it, 3915 // and so use the select value for the phi instead of the old 3916 // LoopExitValue. 3917 if (PreferPredicatedReductionSelect || 3918 TTI->preferPredicatedReductionSelect( 3919 RdxDesc.getOpcode(), PhiTy, 3920 TargetTransformInfo::ReductionFlags())) { 3921 auto *VecRdxPhi = 3922 cast<PHINode>(State.get(PhiR, Part)); 3923 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3924 } 3925 } 3926 } 3927 3928 // If the vector reduction can be performed in a smaller type, we truncate 3929 // then extend the loop exit value to enable InstCombine to evaluate the 3930 // entire expression in the smaller type. 3931 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3932 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3933 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3934 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3935 VectorParts RdxParts(UF); 3936 for (unsigned Part = 0; Part < UF; ++Part) { 3937 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3938 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3939 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3940 : Builder.CreateZExt(Trunc, VecTy); 3941 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3942 if (U != Trunc) { 3943 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3944 RdxParts[Part] = Extnd; 3945 } 3946 } 3947 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3948 for (unsigned Part = 0; Part < UF; ++Part) { 3949 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3950 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3951 } 3952 } 3953 3954 // Reduce all of the unrolled parts into a single vector. 3955 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3956 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3957 3958 // The middle block terminator has already been assigned a DebugLoc here (the 3959 // OrigLoop's single latch terminator). We want the whole middle block to 3960 // appear to execute on this line because: (a) it is all compiler generated, 3961 // (b) these instructions are always executed after evaluating the latch 3962 // conditional branch, and (c) other passes may add new predecessors which 3963 // terminate on this line. This is the easiest way to ensure we don't 3964 // accidentally cause an extra step back into the loop while debugging. 3965 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3966 if (PhiR->isOrdered()) 3967 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3968 else { 3969 // Floating-point operations should have some FMF to enable the reduction. 
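// E.g. with UF = 2 the two unrolled partial results are combined below into a single vector (a 'bin.rdx' binop, a min/max, or a select-cmp, depending on the recurrence kind) before the final target reduction is created.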
3970 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3971 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3972 for (unsigned Part = 1; Part < UF; ++Part) { 3973 Value *RdxPart = State.get(LoopExitInstDef, Part); 3974 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3975 ReducedPartRdx = Builder.CreateBinOp( 3976 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3977 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3978 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3979 ReducedPartRdx, RdxPart); 3980 else 3981 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3982 } 3983 } 3984 3985 // Create the reduction after the loop. Note that inloop reductions create the 3986 // target reduction in the loop using a Reduction recipe. 3987 if (VF.isVector() && !PhiR->isInLoop()) { 3988 ReducedPartRdx = 3989 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 3990 // If the reduction can be performed in a smaller type, we need to extend 3991 // the reduction to the wider type before we branch to the original loop. 3992 if (PhiTy != RdxDesc.getRecurrenceType()) 3993 ReducedPartRdx = RdxDesc.isSigned() 3994 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 3995 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 3996 } 3997 3998 PHINode *ResumePhi = 3999 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4000 4001 // Create a phi node that merges control-flow from the backedge-taken check 4002 // block and the middle block. 4003 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4004 LoopScalarPreHeader->getTerminator()); 4005 4006 // If we are fixing reductions in the epilogue loop then we should already 4007 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4008 // we carry over the incoming values correctly. 4009 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4010 if (Incoming == LoopMiddleBlock) 4011 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4012 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4013 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4014 Incoming); 4015 else 4016 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4017 } 4018 4019 // Set the resume value for this reduction 4020 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4021 4022 // If there were stores of the reduction value to a uniform memory address 4023 // inside the loop, create the final store here. 4024 if (StoreInst *SI = RdxDesc.IntermediateStore) { 4025 StoreInst *NewSI = 4026 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); 4027 propagateMetadata(NewSI, SI); 4028 4029 // If the reduction value is used in other places, 4030 // then let the code below create PHI's for that. 4031 } 4032 4033 // Now, we need to fix the users of the reduction variable 4034 // inside and outside of the scalar remainder loop. 4035 4036 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4037 // in the exit blocks. See comment on analogous loop in 4038 // fixFixedOrderRecurrence for a more complete explaination of the logic. 
4039 if (!Cost->requiresScalarEpilogue(VF)) 4040 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4041 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 4042 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4043 State.Plan->removeLiveOut(&LCSSAPhi); 4044 } 4045 4046 // Fix the scalar loop reduction variable with the incoming reduction sum 4047 // from the vector body and from the backedge value. 4048 int IncomingEdgeBlockIdx = 4049 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4050 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4051 // Pick the other block. 4052 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4053 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4054 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4055 } 4056 4057 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4058 VPTransformState &State) { 4059 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4060 RecurKind RK = RdxDesc.getRecurrenceKind(); 4061 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4062 return; 4063 4064 SmallVector<VPValue *, 8> Worklist; 4065 SmallPtrSet<VPValue *, 8> Visited; 4066 Worklist.push_back(PhiR); 4067 Visited.insert(PhiR); 4068 4069 while (!Worklist.empty()) { 4070 VPValue *Cur = Worklist.pop_back_val(); 4071 for (unsigned Part = 0; Part < UF; ++Part) { 4072 Value *V = State.get(Cur, Part); 4073 if (!isa<OverflowingBinaryOperator>(V)) 4074 break; 4075 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4076 } 4077 4078 for (VPUser *U : Cur->users()) { 4079 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4080 if (!UserRecipe) 4081 continue; 4082 for (VPValue *V : UserRecipe->definedValues()) 4083 if (Visited.insert(V).second) 4084 Worklist.push_back(V); 4085 } 4086 } 4087 } 4088 4089 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4090 // The basic block and loop containing the predicated instruction. 4091 auto *PredBB = PredInst->getParent(); 4092 auto *VectorLoop = LI->getLoopFor(PredBB); 4093 4094 // Initialize a worklist with the operands of the predicated instruction. 4095 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4096 4097 // Holds instructions that we need to analyze again. An instruction may be 4098 // reanalyzed if we don't yet know if we can sink it or not. 4099 SmallVector<Instruction *, 8> InstsToReanalyze; 4100 4101 // Returns true if a given use occurs in the predicated block. Phi nodes use 4102 // their operands in their corresponding predecessor blocks. 4103 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4104 auto *I = cast<Instruction>(U.getUser()); 4105 BasicBlock *BB = I->getParent(); 4106 if (auto *Phi = dyn_cast<PHINode>(I)) 4107 BB = Phi->getIncomingBlock( 4108 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4109 return BB == PredBB; 4110 }; 4111 4112 // Iteratively sink the scalarized operands of the predicated instruction 4113 // into the block we created for it. When an instruction is sunk, it's 4114 // operands are then added to the worklist. The algorithm ends after one pass 4115 // through the worklist doesn't sink a single instruction. 4116 bool Changed; 4117 do { 4118 // Add the instructions that need to be reanalyzed to the worklist, and 4119 // reset the changed indicator. 
4120 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4121 InstsToReanalyze.clear(); 4122 Changed = false; 4123 4124 while (!Worklist.empty()) { 4125 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4126 4127 // We can't sink an instruction if it is a phi node, is not in the loop, 4128 // or may have side effects. 4129 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4130 I->mayHaveSideEffects()) 4131 continue; 4132 4133 // If the instruction is already in PredBB, check if we can sink its 4134 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4135 // sinking the scalar instruction I, hence it appears in PredBB; but it 4136 // may have failed to sink I's operands (recursively), which we try 4137 // (again) here. 4138 if (I->getParent() == PredBB) { 4139 Worklist.insert(I->op_begin(), I->op_end()); 4140 continue; 4141 } 4142 4143 // It's legal to sink the instruction if all its uses occur in the 4144 // predicated block. Otherwise, there's nothing to do yet, and we may 4145 // need to reanalyze the instruction. 4146 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4147 InstsToReanalyze.push_back(I); 4148 continue; 4149 } 4150 4151 // Move the instruction to the beginning of the predicated block, and add 4152 // it's operands to the worklist. 4153 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4154 Worklist.insert(I->op_begin(), I->op_end()); 4155 4156 // The sinking may have enabled other instructions to be sunk, so we will 4157 // need to iterate. 4158 Changed = true; 4159 } 4160 } while (Changed); 4161 } 4162 4163 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4164 VPTransformState &State) { 4165 auto Iter = vp_depth_first_deep(Plan.getEntry()); 4166 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4167 for (VPRecipeBase &P : VPBB->phis()) { 4168 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4169 if (!VPPhi) 4170 continue; 4171 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4172 // Make sure the builder has a valid insert point. 4173 Builder.SetInsertPoint(NewPhi); 4174 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4175 VPValue *Inc = VPPhi->getIncomingValue(i); 4176 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4177 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4178 } 4179 } 4180 } 4181 } 4182 4183 bool InnerLoopVectorizer::useOrderedReductions( 4184 const RecurrenceDescriptor &RdxDesc) { 4185 return Cost->useOrderedReductions(RdxDesc); 4186 } 4187 4188 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4189 // We should not collect Scalars more than once per VF. Right now, this 4190 // function is called from collectUniformsAndScalars(), which already does 4191 // this check. Collecting Scalars for VF=1 does not make any sense. 4192 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4193 "This function should not be visited twice for the same VF"); 4194 4195 // This avoids any chances of creating a REPLICATE recipe during planning 4196 // since that would result in generation of scalarized code during execution, 4197 // which is not supported for scalable vectors. 4198 if (VF.isScalable()) { 4199 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4200 return; 4201 } 4202 4203 SmallSetVector<Instruction *, 8> Worklist; 4204 4205 // These sets are used to seed the analysis with pointers used by memory 4206 // accesses that will remain scalar. 
4207 SmallSetVector<Instruction *, 8> ScalarPtrs;
4208 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4209 auto *Latch = TheLoop->getLoopLatch();
4210
4211 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4212 // The pointer operands of loads and stores will be scalar as long as the
4213 // memory access is not a gather or scatter operation. The value operand of a
4214 // store will remain scalar if the store is scalarized.
4215 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4216 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4217 assert(WideningDecision != CM_Unknown &&
4218 "Widening decision should be ready at this moment");
4219 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4220 if (Ptr == Store->getValueOperand())
4221 return WideningDecision == CM_Scalarize;
4222 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4223 "Ptr is neither a value nor a pointer operand");
4224 return WideningDecision != CM_GatherScatter;
4225 };
4226
4227 // A helper that returns true if the given value is a bitcast or
4228 // getelementptr instruction contained in the loop.
4229 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4230 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4231 isa<GetElementPtrInst>(V)) &&
4232 !TheLoop->isLoopInvariant(V);
4233 };
4234
4235 // A helper that evaluates a memory access's use of a pointer. If the use will
4236 // be a scalar use and the pointer is only used by memory accesses, we place
4237 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4238 // PossibleNonScalarPtrs.
4239 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4240 // We only care about bitcast and getelementptr instructions contained in
4241 // the loop.
4242 if (!isLoopVaryingBitCastOrGEP(Ptr))
4243 return;
4244
4245 // If the pointer has already been identified as scalar (e.g., if it was
4246 // also identified as uniform), there's nothing to do.
4247 auto *I = cast<Instruction>(Ptr);
4248 if (Worklist.count(I))
4249 return;
4250
4251 // If the use of the pointer will be a scalar use, and all users of the
4252 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4253 // place the pointer in PossibleNonScalarPtrs.
4254 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4255 return isa<LoadInst>(U) || isa<StoreInst>(U);
4256 }))
4257 ScalarPtrs.insert(I);
4258 else
4259 PossibleNonScalarPtrs.insert(I);
4260 };
4261
4262 // We seed the scalars analysis with two classes of instructions: (1)
4263 // instructions marked uniform-after-vectorization and (2) bitcast,
4264 // getelementptr and (pointer) phi instructions used by memory accesses
4265 // requiring a scalar use.
4266 //
4267 // (1) Add to the worklist all instructions that have been identified as
4268 // uniform-after-vectorization.
4269 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4270
4271 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4272 // memory accesses requiring a scalar use. The pointer operands of loads and
4273 // stores will be scalar as long as the memory access is not a gather or
4274 // scatter operation. The value operand of a store will remain scalar if the
4275 // store is scalarized.
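// For example (hypothetical IR): in
//   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
//   %v   = load i32, ptr %gep
// a load that will be widened into a consecutive (non-gather) access only
// needs the address of lane 0 per unrolled part, so %gep is seeded here as a
// candidate scalar pointer.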
4276 for (auto *BB : TheLoop->blocks()) 4277 for (auto &I : *BB) { 4278 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4279 evaluatePtrUse(Load, Load->getPointerOperand()); 4280 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4281 evaluatePtrUse(Store, Store->getPointerOperand()); 4282 evaluatePtrUse(Store, Store->getValueOperand()); 4283 } 4284 } 4285 for (auto *I : ScalarPtrs) 4286 if (!PossibleNonScalarPtrs.count(I)) { 4287 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4288 Worklist.insert(I); 4289 } 4290 4291 // Insert the forced scalars. 4292 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4293 // induction variable when the PHI user is scalarized. 4294 auto ForcedScalar = ForcedScalars.find(VF); 4295 if (ForcedScalar != ForcedScalars.end()) 4296 for (auto *I : ForcedScalar->second) { 4297 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 4298 Worklist.insert(I); 4299 } 4300 4301 // Expand the worklist by looking through any bitcasts and getelementptr 4302 // instructions we've already identified as scalar. This is similar to the 4303 // expansion step in collectLoopUniforms(); however, here we're only 4304 // expanding to include additional bitcasts and getelementptr instructions. 4305 unsigned Idx = 0; 4306 while (Idx != Worklist.size()) { 4307 Instruction *Dst = Worklist[Idx++]; 4308 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4309 continue; 4310 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4311 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4312 auto *J = cast<Instruction>(U); 4313 return !TheLoop->contains(J) || Worklist.count(J) || 4314 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4315 isScalarUse(J, Src)); 4316 })) { 4317 Worklist.insert(Src); 4318 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4319 } 4320 } 4321 4322 // An induction variable will remain scalar if all users of the induction 4323 // variable and induction variable update remain scalar. 4324 for (const auto &Induction : Legal->getInductionVars()) { 4325 auto *Ind = Induction.first; 4326 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4327 4328 // If tail-folding is applied, the primary induction variable will be used 4329 // to feed a vector compare. 4330 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4331 continue; 4332 4333 // Returns true if \p Indvar is a pointer induction that is used directly by 4334 // load/store instruction \p I. 4335 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4336 Instruction *I) { 4337 return Induction.second.getKind() == 4338 InductionDescriptor::IK_PtrInduction && 4339 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4340 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4341 }; 4342 4343 // Determine if all users of the induction variable are scalar after 4344 // vectorization. 4345 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4346 auto *I = cast<Instruction>(U); 4347 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4348 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4349 }); 4350 if (!ScalarInd) 4351 continue; 4352 4353 // Determine if all users of the induction variable update instruction are 4354 // scalar after vectorization. 
4355 auto ScalarIndUpdate =
4356 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4357 auto *I = cast<Instruction>(U);
4358 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4359 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4360 });
4361 if (!ScalarIndUpdate)
4362 continue;
4363
4364 // The induction variable and its update instruction will remain scalar.
4365 Worklist.insert(Ind);
4366 Worklist.insert(IndUpdate);
4367 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4368 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4369 << "\n");
4370 }
4371
4372 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4373 }
4374
4375 bool LoopVectorizationCostModel::isScalarWithPredication(
4376 Instruction *I, ElementCount VF) const {
4377 if (!isPredicatedInst(I))
4378 return false;
4379
4380 // Do we have a non-scalar lowering for this predicated
4381 // instruction? No - it is scalar with predication.
4382 switch(I->getOpcode()) {
4383 default:
4384 return true;
4385 case Instruction::Load:
4386 case Instruction::Store: {
4387 auto *Ptr = getLoadStorePointerOperand(I);
4388 auto *Ty = getLoadStoreType(I);
4389 Type *VTy = Ty;
4390 if (VF.isVector())
4391 VTy = VectorType::get(Ty, VF);
4392 const Align Alignment = getLoadStoreAlignment(I);
4393 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4394 TTI.isLegalMaskedGather(VTy, Alignment))
4395 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4396 TTI.isLegalMaskedScatter(VTy, Alignment));
4397 }
4398 case Instruction::UDiv:
4399 case Instruction::SDiv:
4400 case Instruction::SRem:
4401 case Instruction::URem: {
4402 // We have the option to use the safe-divisor idiom to avoid predication.
4403 // The cost-based decision here will always select safe-divisor for
4404 // scalable vectors as scalarization isn't legal.
4405 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4406 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4407 }
4408 }
4409 }
4410
4411 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4412 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4413 return false;
4414
4415 // Can we prove this instruction is safe to unconditionally execute?
4416 // If not, we must use some form of predication.
4417 switch(I->getOpcode()) {
4418 default:
4419 return false;
4420 case Instruction::Load:
4421 case Instruction::Store: {
4422 if (!Legal->isMaskRequired(I))
4423 return false;
4424 // When we know the load's address is loop invariant and the instruction
4425 // in the original scalar loop was unconditionally executed then we
4426 // don't need to mark it as a predicated instruction. Tail folding may
4427 // introduce additional predication, but we're guaranteed to always have
4428 // at least one active lane. We call Legal->blockNeedsPredication here
4429 // because it doesn't query tail-folding. For stores, we need to prove
4430 // both speculation safety (which follows from the same argument as loads)
4431 // and that the value being stored is correct. The easiest
4432 // form of the latter is to require that all values stored are the same.
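// For example (sketch): a store like
//   *p = 42;   // address and stored value both loop-invariant
// executed unconditionally in the scalar loop can stay unpredicated even
// under tail folding, because at least one lane is always active and every
// active lane would store the same value.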
4433 if (Legal->isUniformMemOp(*I) && 4434 (isa<LoadInst>(I) || 4435 (isa<StoreInst>(I) && 4436 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 4437 !Legal->blockNeedsPredication(I->getParent())) 4438 return false; 4439 return true; 4440 } 4441 case Instruction::UDiv: 4442 case Instruction::SDiv: 4443 case Instruction::SRem: 4444 case Instruction::URem: 4445 // TODO: We can use the loop-preheader as context point here and get 4446 // context sensitive reasoning 4447 return !isSafeToSpeculativelyExecute(I); 4448 } 4449 } 4450 4451 std::pair<InstructionCost, InstructionCost> 4452 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 4453 ElementCount VF) const { 4454 assert(I->getOpcode() == Instruction::UDiv || 4455 I->getOpcode() == Instruction::SDiv || 4456 I->getOpcode() == Instruction::SRem || 4457 I->getOpcode() == Instruction::URem); 4458 assert(!isSafeToSpeculativelyExecute(I)); 4459 4460 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4461 4462 // Scalarization isn't legal for scalable vector types 4463 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 4464 if (!VF.isScalable()) { 4465 // Get the scalarization cost and scale this amount by the probability of 4466 // executing the predicated block. If the instruction is not predicated, 4467 // we fall through to the next case. 4468 ScalarizationCost = 0; 4469 4470 // These instructions have a non-void type, so account for the phi nodes 4471 // that we will create. This cost is likely to be zero. The phi node 4472 // cost, if any, should be scaled by the block probability because it 4473 // models a copy at the end of each predicated block. 4474 ScalarizationCost += VF.getKnownMinValue() * 4475 TTI.getCFInstrCost(Instruction::PHI, CostKind); 4476 4477 // The cost of the non-predicated instruction. 4478 ScalarizationCost += VF.getKnownMinValue() * 4479 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 4480 4481 // The cost of insertelement and extractelement instructions needed for 4482 // scalarization. 4483 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 4484 4485 // Scale the cost by the probability of executing the predicated blocks. 4486 // This assumes the predicated block for each vector lane is equally 4487 // likely. 4488 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 4489 } 4490 InstructionCost SafeDivisorCost = 0; 4491 4492 auto *VecTy = ToVectorTy(I->getType(), VF); 4493 4494 // The cost of the select guard to ensure all lanes are well defined 4495 // after we speculate above any internal control flow. 4496 SafeDivisorCost += TTI.getCmpSelInstrCost( 4497 Instruction::Select, VecTy, 4498 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 4499 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4500 4501 // Certain instructions can be cheaper to vectorize if they have a constant 4502 // second vector operand. One example of this are shifts on x86. 
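// (E.g. a shift by an immediate such as 'x >> 3' is typically cheaper on x86
// than a shift by a vector of variable amounts; the same operand-kind query
// below lets TTI cost a uniform or constant divisor more precisely.)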
4503 Value *Op2 = I->getOperand(1); 4504 auto Op2Info = TTI.getOperandInfo(Op2); 4505 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 4506 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 4507 4508 SmallVector<const Value *, 4> Operands(I->operand_values()); 4509 SafeDivisorCost += TTI.getArithmeticInstrCost( 4510 I->getOpcode(), VecTy, CostKind, 4511 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 4512 Op2Info, Operands, I); 4513 return {ScalarizationCost, SafeDivisorCost}; 4514 } 4515 4516 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4517 Instruction *I, ElementCount VF) { 4518 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4519 assert(getWideningDecision(I, VF) == CM_Unknown && 4520 "Decision should not be set yet."); 4521 auto *Group = getInterleavedAccessGroup(I); 4522 assert(Group && "Must have a group."); 4523 4524 // If the instruction's allocated size doesn't equal it's type size, it 4525 // requires padding and will be scalarized. 4526 auto &DL = I->getModule()->getDataLayout(); 4527 auto *ScalarTy = getLoadStoreType(I); 4528 if (hasIrregularType(ScalarTy, DL)) 4529 return false; 4530 4531 // If the group involves a non-integral pointer, we may not be able to 4532 // losslessly cast all values to a common type. 4533 unsigned InterleaveFactor = Group->getFactor(); 4534 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4535 for (unsigned i = 0; i < InterleaveFactor; i++) { 4536 Instruction *Member = Group->getMember(i); 4537 if (!Member) 4538 continue; 4539 auto *MemberTy = getLoadStoreType(Member); 4540 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4541 // Don't coerce non-integral pointers to integers or vice versa. 4542 if (MemberNI != ScalarNI) { 4543 // TODO: Consider adding special nullptr value case here 4544 return false; 4545 } else if (MemberNI && ScalarNI && 4546 ScalarTy->getPointerAddressSpace() != 4547 MemberTy->getPointerAddressSpace()) { 4548 return false; 4549 } 4550 } 4551 4552 // Check if masking is required. 4553 // A Group may need masking for one of two reasons: it resides in a block that 4554 // needs predication, or it was decided to use masking to deal with gaps 4555 // (either a gap at the end of a load-access that may result in a speculative 4556 // load, or any gaps in a store-access). 4557 bool PredicatedAccessRequiresMasking = 4558 blockNeedsPredicationForAnyReason(I->getParent()) && 4559 Legal->isMaskRequired(I); 4560 bool LoadAccessWithGapsRequiresEpilogMasking = 4561 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4562 !isScalarEpilogueAllowed(); 4563 bool StoreAccessWithGapsRequiresMasking = 4564 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4565 if (!PredicatedAccessRequiresMasking && 4566 !LoadAccessWithGapsRequiresEpilogMasking && 4567 !StoreAccessWithGapsRequiresMasking) 4568 return true; 4569 4570 // If masked interleaving is required, we expect that the user/target had 4571 // enabled it, because otherwise it either wouldn't have been created or 4572 // it should have been invalidated by the CostModel. 4573 assert(useMaskedInterleavedAccesses(TTI) && 4574 "Masked interleave-groups for predicated accesses are not enabled."); 4575 4576 if (Group->isReverse()) 4577 return false; 4578 4579 auto *Ty = getLoadStoreType(I); 4580 const Align Alignment = getLoadStoreAlignment(I); 4581 return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment) 4582 : TTI.isLegalMaskedStore(Ty, Alignment); 4583 } 4584 4585 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4586 Instruction *I, ElementCount VF) { 4587 // Get and ensure we have a valid memory instruction. 4588 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4589 4590 auto *Ptr = getLoadStorePointerOperand(I); 4591 auto *ScalarTy = getLoadStoreType(I); 4592 4593 // In order to be widened, the pointer should be consecutive, first of all. 4594 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4595 return false; 4596 4597 // If the instruction is a store located in a predicated block, it will be 4598 // scalarized. 4599 if (isScalarWithPredication(I, VF)) 4600 return false; 4601 4602 // If the instruction's allocated size doesn't equal it's type size, it 4603 // requires padding and will be scalarized. 4604 auto &DL = I->getModule()->getDataLayout(); 4605 if (hasIrregularType(ScalarTy, DL)) 4606 return false; 4607 4608 return true; 4609 } 4610 4611 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4612 // We should not collect Uniforms more than once per VF. Right now, 4613 // this function is called from collectUniformsAndScalars(), which 4614 // already does this check. Collecting Uniforms for VF=1 does not make any 4615 // sense. 4616 4617 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4618 "This function should not be visited twice for the same VF"); 4619 4620 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4621 // not analyze again. Uniforms.count(VF) will return 1. 4622 Uniforms[VF].clear(); 4623 4624 // We now know that the loop is vectorizable! 4625 // Collect instructions inside the loop that will remain uniform after 4626 // vectorization. 4627 4628 // Global values, params and instructions outside of current loop are out of 4629 // scope. 4630 auto isOutOfScope = [&](Value *V) -> bool { 4631 Instruction *I = dyn_cast<Instruction>(V); 4632 return (!I || !TheLoop->contains(I)); 4633 }; 4634 4635 // Worklist containing uniform instructions demanding lane 0. 4636 SetVector<Instruction *> Worklist; 4637 BasicBlock *Latch = TheLoop->getLoopLatch(); 4638 4639 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4640 // that are scalar with predication must not be considered uniform after 4641 // vectorization, because that would create an erroneous replicating region 4642 // where only a single instance out of VF should be formed. 4643 // TODO: optimize such seldom cases if found important, see PR40816. 4644 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4645 if (isOutOfScope(I)) { 4646 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4647 << *I << "\n"); 4648 return; 4649 } 4650 if (isScalarWithPredication(I, VF)) { 4651 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4652 << *I << "\n"); 4653 return; 4654 } 4655 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4656 Worklist.insert(I); 4657 }; 4658 4659 // Start with the conditional branch. If the branch condition is an 4660 // instruction contained in the loop that is only used by the branch, it is 4661 // uniform. 4662 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4663 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4664 addToWorklistIfAllowed(Cmp); 4665 4666 // Return true if all lanes perform the same memory operation, and we can 4667 // thus chose to execute only one. 
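// E.g. (illustrative) a load from a loop-invariant address such as
//   %v = load i32, ptr @counter
// yields the same value in every lane, so emitting a single scalar load (and
// broadcasting the result where needed) is sufficient.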
4668 auto isUniformMemOpUse = [&](Instruction *I) { 4669 if (!Legal->isUniformMemOp(*I)) 4670 return false; 4671 if (isa<LoadInst>(I)) 4672 // Loading the same address always produces the same result - at least 4673 // assuming aliasing and ordering which have already been checked. 4674 return true; 4675 // Storing the same value on every iteration. 4676 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 4677 }; 4678 4679 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4680 InstWidening WideningDecision = getWideningDecision(I, VF); 4681 assert(WideningDecision != CM_Unknown && 4682 "Widening decision should be ready at this moment"); 4683 4684 if (isUniformMemOpUse(I)) 4685 return true; 4686 4687 return (WideningDecision == CM_Widen || 4688 WideningDecision == CM_Widen_Reverse || 4689 WideningDecision == CM_Interleave); 4690 }; 4691 4692 // Returns true if Ptr is the pointer operand of a memory access instruction 4693 // I, I is known to not require scalarization, and the pointer is not also 4694 // stored. 4695 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4696 auto GetStoredValue = [I]() -> Value * { 4697 if (!isa<StoreInst>(I)) 4698 return nullptr; 4699 return I->getOperand(0); 4700 }; 4701 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF) && 4702 GetStoredValue() != Ptr; 4703 }; 4704 4705 // Holds a list of values which are known to have at least one uniform use. 4706 // Note that there may be other uses which aren't uniform. A "uniform use" 4707 // here is something which only demands lane 0 of the unrolled iterations; 4708 // it does not imply that all lanes produce the same value (e.g. this is not 4709 // the usual meaning of uniform) 4710 SetVector<Value *> HasUniformUse; 4711 4712 // Scan the loop for instructions which are either a) known to have only 4713 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4714 for (auto *BB : TheLoop->blocks()) 4715 for (auto &I : *BB) { 4716 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4717 switch (II->getIntrinsicID()) { 4718 case Intrinsic::sideeffect: 4719 case Intrinsic::experimental_noalias_scope_decl: 4720 case Intrinsic::assume: 4721 case Intrinsic::lifetime_start: 4722 case Intrinsic::lifetime_end: 4723 if (TheLoop->hasLoopInvariantOperands(&I)) 4724 addToWorklistIfAllowed(&I); 4725 break; 4726 default: 4727 break; 4728 } 4729 } 4730 4731 // ExtractValue instructions must be uniform, because the operands are 4732 // known to be loop-invariant. 4733 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4734 assert(isOutOfScope(EVI->getAggregateOperand()) && 4735 "Expected aggregate value to be loop invariant"); 4736 addToWorklistIfAllowed(EVI); 4737 continue; 4738 } 4739 4740 // If there's no pointer operand, there's nothing to do. 4741 auto *Ptr = getLoadStorePointerOperand(&I); 4742 if (!Ptr) 4743 continue; 4744 4745 if (isUniformMemOpUse(&I)) 4746 addToWorklistIfAllowed(&I); 4747 4748 if (isVectorizedMemAccessUse(&I, Ptr)) { 4749 assert(isUniformDecision(&I, VF) && "consistency check"); 4750 HasUniformUse.insert(Ptr); 4751 } 4752 } 4753 4754 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4755 // demanding) users. Since loops are assumed to be in LCSSA form, this 4756 // disallows uses outside the loop as well. 
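// For instance, an address computation whose only users are the uniform
// memory operations found above never needs more than its lane-0 value, so
// it can itself be treated as uniform.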
4757 for (auto *V : HasUniformUse) { 4758 if (isOutOfScope(V)) 4759 continue; 4760 auto *I = cast<Instruction>(V); 4761 auto UsersAreMemAccesses = 4762 llvm::all_of(I->users(), [&](User *U) -> bool { 4763 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4764 }); 4765 if (UsersAreMemAccesses) 4766 addToWorklistIfAllowed(I); 4767 } 4768 4769 // Expand Worklist in topological order: whenever a new instruction 4770 // is added , its users should be already inside Worklist. It ensures 4771 // a uniform instruction will only be used by uniform instructions. 4772 unsigned idx = 0; 4773 while (idx != Worklist.size()) { 4774 Instruction *I = Worklist[idx++]; 4775 4776 for (auto *OV : I->operand_values()) { 4777 // isOutOfScope operands cannot be uniform instructions. 4778 if (isOutOfScope(OV)) 4779 continue; 4780 // First order recurrence Phi's should typically be considered 4781 // non-uniform. 4782 auto *OP = dyn_cast<PHINode>(OV); 4783 if (OP && Legal->isFixedOrderRecurrence(OP)) 4784 continue; 4785 // If all the users of the operand are uniform, then add the 4786 // operand into the uniform worklist. 4787 auto *OI = cast<Instruction>(OV); 4788 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4789 auto *J = cast<Instruction>(U); 4790 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4791 })) 4792 addToWorklistIfAllowed(OI); 4793 } 4794 } 4795 4796 // For an instruction to be added into Worklist above, all its users inside 4797 // the loop should also be in Worklist. However, this condition cannot be 4798 // true for phi nodes that form a cyclic dependence. We must process phi 4799 // nodes separately. An induction variable will remain uniform if all users 4800 // of the induction variable and induction variable update remain uniform. 4801 // The code below handles both pointer and non-pointer induction variables. 4802 for (const auto &Induction : Legal->getInductionVars()) { 4803 auto *Ind = Induction.first; 4804 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4805 4806 // Determine if all users of the induction variable are uniform after 4807 // vectorization. 4808 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4809 auto *I = cast<Instruction>(U); 4810 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4811 isVectorizedMemAccessUse(I, Ind); 4812 }); 4813 if (!UniformInd) 4814 continue; 4815 4816 // Determine if all users of the induction variable update instruction are 4817 // uniform after vectorization. 4818 auto UniformIndUpdate = 4819 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4820 auto *I = cast<Instruction>(U); 4821 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4822 isVectorizedMemAccessUse(I, IndUpdate); 4823 }); 4824 if (!UniformIndUpdate) 4825 continue; 4826 4827 // The induction variable and its update instruction will remain uniform. 4828 addToWorklistIfAllowed(Ind); 4829 addToWorklistIfAllowed(IndUpdate); 4830 } 4831 4832 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4833 } 4834 4835 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4836 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4837 4838 if (Legal->getRuntimePointerChecking()->Need) { 4839 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4840 "runtime pointer checks needed. 
Enable vectorization of this " 4841 "loop with '#pragma clang loop vectorize(enable)' when " 4842 "compiling with -Os/-Oz", 4843 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4844 return true; 4845 } 4846 4847 if (!PSE.getPredicate().isAlwaysTrue()) { 4848 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4849 "runtime SCEV checks needed. Enable vectorization of this " 4850 "loop with '#pragma clang loop vectorize(enable)' when " 4851 "compiling with -Os/-Oz", 4852 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4853 return true; 4854 } 4855 4856 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4857 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4858 reportVectorizationFailure("Runtime stride check for small trip count", 4859 "runtime stride == 1 checks needed. Enable vectorization of " 4860 "this loop without such check by compiling with -Os/-Oz", 4861 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4862 return true; 4863 } 4864 4865 return false; 4866 } 4867 4868 ElementCount 4869 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4870 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4871 return ElementCount::getScalable(0); 4872 4873 if (Hints->isScalableVectorizationDisabled()) { 4874 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4875 "ScalableVectorizationDisabled", ORE, TheLoop); 4876 return ElementCount::getScalable(0); 4877 } 4878 4879 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4880 4881 auto MaxScalableVF = ElementCount::getScalable( 4882 std::numeric_limits<ElementCount::ScalarTy>::max()); 4883 4884 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4885 // FIXME: While for scalable vectors this is currently sufficient, this should 4886 // be replaced by a more detailed mechanism that filters out specific VFs, 4887 // instead of invalidating vectorization for a whole set of VFs based on the 4888 // MaxVF. 4889 4890 // Disable scalable vectorization if the loop contains unsupported reductions. 4891 if (!canVectorizeReductions(MaxScalableVF)) { 4892 reportVectorizationInfo( 4893 "Scalable vectorization not supported for the reduction " 4894 "operations found in this loop.", 4895 "ScalableVFUnfeasible", ORE, TheLoop); 4896 return ElementCount::getScalable(0); 4897 } 4898 4899 // Disable scalable vectorization if the loop contains any instructions 4900 // with element types not supported for scalable vectors. 4901 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4902 return !Ty->isVoidTy() && 4903 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4904 })) { 4905 reportVectorizationInfo("Scalable vectorization is not supported " 4906 "for all element types found in this loop.", 4907 "ScalableVFUnfeasible", ORE, TheLoop); 4908 return ElementCount::getScalable(0); 4909 } 4910 4911 if (Legal->isSafeForAnyVectorWidth()) 4912 return MaxScalableVF; 4913 4914 // Limit MaxScalableVF by the maximum safe dependence distance. 4915 std::optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4916 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4917 MaxVScale = 4918 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4919 MaxScalableVF = 4920 ElementCount::getScalable(MaxVScale ? 
(MaxSafeElements / *MaxVScale) : 0); 4921 if (!MaxScalableVF) 4922 reportVectorizationInfo( 4923 "Max legal vector width too small, scalable vectorization " 4924 "unfeasible.", 4925 "ScalableVFUnfeasible", ORE, TheLoop); 4926 4927 return MaxScalableVF; 4928 } 4929 4930 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4931 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4932 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4933 unsigned SmallestType, WidestType; 4934 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4935 4936 // Get the maximum safe dependence distance in bits computed by LAA. 4937 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4938 // the memory accesses that is most restrictive (involved in the smallest 4939 // dependence distance). 4940 unsigned MaxSafeElements = 4941 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4942 4943 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4944 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4945 4946 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4947 << ".\n"); 4948 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4949 << ".\n"); 4950 4951 // First analyze the UserVF, fall back if the UserVF should be ignored. 4952 if (UserVF) { 4953 auto MaxSafeUserVF = 4954 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4955 4956 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4957 // If `VF=vscale x N` is safe, then so is `VF=N` 4958 if (UserVF.isScalable()) 4959 return FixedScalableVFPair( 4960 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4961 else 4962 return UserVF; 4963 } 4964 4965 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4966 4967 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4968 // is better to ignore the hint and let the compiler choose a suitable VF. 4969 if (!UserVF.isScalable()) { 4970 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4971 << " is unsafe, clamping to max safe VF=" 4972 << MaxSafeFixedVF << ".\n"); 4973 ORE->emit([&]() { 4974 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4975 TheLoop->getStartLoc(), 4976 TheLoop->getHeader()) 4977 << "User-specified vectorization factor " 4978 << ore::NV("UserVectorizationFactor", UserVF) 4979 << " is unsafe, clamping to maximum safe vectorization factor " 4980 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4981 }); 4982 return MaxSafeFixedVF; 4983 } 4984 4985 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4986 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4987 << " is ignored because scalable vectors are not " 4988 "available.\n"); 4989 ORE->emit([&]() { 4990 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4991 TheLoop->getStartLoc(), 4992 TheLoop->getHeader()) 4993 << "User-specified vectorization factor " 4994 << ore::NV("UserVectorizationFactor", UserVF) 4995 << " is ignored because the target does not support scalable " 4996 "vectors. The compiler will pick a more suitable value."; 4997 }); 4998 } else { 4999 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5000 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5001 ORE->emit([&]() { 5002 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5003 TheLoop->getStartLoc(), 5004 TheLoop->getHeader()) 5005 << "User-specified vectorization factor " 5006 << ore::NV("UserVectorizationFactor", UserVF) 5007 << " is unsafe. Ignoring the hint to let the compiler pick a " 5008 "more suitable value."; 5009 }); 5010 } 5011 } 5012 5013 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5014 << " / " << WidestType << " bits.\n"); 5015 5016 FixedScalableVFPair Result(ElementCount::getFixed(1), 5017 ElementCount::getScalable(0)); 5018 if (auto MaxVF = 5019 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5020 MaxSafeFixedVF, FoldTailByMasking)) 5021 Result.FixedVF = MaxVF; 5022 5023 if (auto MaxVF = 5024 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5025 MaxSafeScalableVF, FoldTailByMasking)) 5026 if (MaxVF.isScalable()) { 5027 Result.ScalableVF = MaxVF; 5028 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5029 << "\n"); 5030 } 5031 5032 return Result; 5033 } 5034 5035 FixedScalableVFPair 5036 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5037 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5038 // TODO: It may by useful to do since it's still likely to be dynamically 5039 // uniform if the target can skip. 5040 reportVectorizationFailure( 5041 "Not inserting runtime ptr check for divergent target", 5042 "runtime pointer checks needed. Not enabled for divergent target", 5043 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5044 return FixedScalableVFPair::getNone(); 5045 } 5046 5047 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5048 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5049 if (TC == 1) { 5050 reportVectorizationFailure("Single iteration (non) loop", 5051 "loop trip count is one, irrelevant for vectorization", 5052 "SingleIterationLoop", ORE, TheLoop); 5053 return FixedScalableVFPair::getNone(); 5054 } 5055 5056 switch (ScalarEpilogueStatus) { 5057 case CM_ScalarEpilogueAllowed: 5058 return computeFeasibleMaxVF(TC, UserVF, false); 5059 case CM_ScalarEpilogueNotAllowedUsePredicate: 5060 [[fallthrough]]; 5061 case CM_ScalarEpilogueNotNeededUsePredicate: 5062 LLVM_DEBUG( 5063 dbgs() << "LV: vector predicate hint/switch found.\n" 5064 << "LV: Not allowing scalar epilogue, creating predicated " 5065 << "vector loop.\n"); 5066 break; 5067 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5068 // fallthrough as a special case of OptForSize 5069 case CM_ScalarEpilogueNotAllowedOptSize: 5070 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5071 LLVM_DEBUG( 5072 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5073 else 5074 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5075 << "count.\n"); 5076 5077 // Bail if runtime checks are required, which are not good when optimising 5078 // for size. 5079 if (runtimeChecksRequired()) 5080 return FixedScalableVFPair::getNone(); 5081 5082 break; 5083 } 5084 5085 // The only loops we can vectorize without a scalar epilogue, are loops with 5086 // a bottom-test and a single exiting block. We'd have to handle the fact 5087 // that not every instruction executes on the last iteration. This will 5088 // require a lane mask which varies through the vector loop body. 
(TODO)
5089 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5090 // If there was a tail-folding hint/switch, but we can't fold the tail by
5091 // masking, fall back to a vectorization with a scalar epilogue.
5092 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5093 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5094 "scalar epilogue instead.\n");
5095 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5096 return computeFeasibleMaxVF(TC, UserVF, false);
5097 }
5098 return FixedScalableVFPair::getNone();
5099 }
5100
5101 // Now try tail folding.
5102
5103 // Invalidate interleave groups that require an epilogue if we can't mask
5104 // the interleave-group.
5105 if (!useMaskedInterleavedAccesses(TTI)) {
5106 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5107 "No decisions should have been taken at this point");
5108 // Note: There is no need to invalidate any cost modeling decisions here, as
5109 // none were taken so far.
5110 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5111 }
5112
5113 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5114 // Avoid tail folding if the trip count is known to be a multiple of any VF
5115 // we choose.
5116 // FIXME: The condition below pessimises the case for fixed-width vectors,
5117 // when scalable VFs are also candidates for vectorization.
5118 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5119 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5120 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5121 "MaxFixedVF must be a power of 2");
5122 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5123 : MaxFixedVF.getFixedValue();
5124 ScalarEvolution *SE = PSE.getSE();
5125 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5126 const SCEV *ExitCount = SE->getAddExpr(
5127 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5128 const SCEV *Rem = SE->getURemExpr(
5129 SE->applyLoopGuards(ExitCount, TheLoop),
5130 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5131 if (Rem->isZero()) {
5132 // Accept MaxFixedVF if we do not have a tail.
5133 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5134 return MaxFactors;
5135 }
5136 }
5137
5138 // If we don't know the precise trip count, or if the trip count that we
5139 // found modulo the vectorization factor is not zero, try to fold the tail
5140 // by masking.
5141 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5142 if (Legal->prepareToFoldTailByMasking()) {
5143 FoldTailByMasking = true;
5144 return MaxFactors;
5145 }
5146
5147 // If there was a tail-folding hint/switch, but we can't fold the tail by
5148 // masking, fall back to a vectorization with a scalar epilogue.
5149 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5150 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5151 "scalar epilogue instead.\n"); 5152 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5153 return MaxFactors; 5154 } 5155 5156 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5157 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5158 return FixedScalableVFPair::getNone(); 5159 } 5160 5161 if (TC == 0) { 5162 reportVectorizationFailure( 5163 "Unable to calculate the loop count due to complex control flow", 5164 "unable to calculate the loop count due to complex control flow", 5165 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5166 return FixedScalableVFPair::getNone(); 5167 } 5168 5169 reportVectorizationFailure( 5170 "Cannot optimize for size and vectorize at the same time.", 5171 "cannot optimize for size and vectorize at the same time. " 5172 "Enable vectorization of this loop with '#pragma clang loop " 5173 "vectorize(enable)' when compiling with -Os/-Oz", 5174 "NoTailLoopWithOptForSize", ORE, TheLoop); 5175 return FixedScalableVFPair::getNone(); 5176 } 5177 5178 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5179 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5180 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5181 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5182 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 5183 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5184 : TargetTransformInfo::RGK_FixedWidthVector); 5185 5186 // Convenience function to return the minimum of two ElementCounts. 5187 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5188 assert((LHS.isScalable() == RHS.isScalable()) && 5189 "Scalable flags must match"); 5190 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5191 }; 5192 5193 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5194 // Note that both WidestRegister and WidestType may not be a powers of 2. 5195 auto MaxVectorElementCount = ElementCount::get( 5196 PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType), 5197 ComputeScalableMaxVF); 5198 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5199 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5200 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5201 5202 if (!MaxVectorElementCount) { 5203 LLVM_DEBUG(dbgs() << "LV: The target has no " 5204 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5205 << " vector registers.\n"); 5206 return ElementCount::getFixed(1); 5207 } 5208 5209 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 5210 if (MaxVectorElementCount.isScalable() && 5211 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5212 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5213 auto Min = Attr.getVScaleRangeMin(); 5214 WidestRegisterMinEC *= Min; 5215 } 5216 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && 5217 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5218 // If loop trip count (TC) is known at compile time there is no point in 5219 // choosing VF greater than TC (as done in the loop below). Select maximum 5220 // power of two which doesn't exceed TC. 5221 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5222 // when the TC is less than or equal to the known number of lanes. 
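// Worked example (illustrative numbers): with ConstTripCount == 12 and
// registers wide enough for 16 elements, PowerOf2Floor(12) == 8, so the
// chosen VF becomes 8 rather than 16 and no whole vector iteration is
// wasted.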
5223 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5224 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5225 "exceeding the constant trip count: " 5226 << ClampedConstTripCount << "\n"); 5227 return ElementCount::getFixed(ClampedConstTripCount); 5228 } 5229 5230 TargetTransformInfo::RegisterKind RegKind = 5231 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5232 : TargetTransformInfo::RGK_FixedWidthVector; 5233 ElementCount MaxVF = MaxVectorElementCount; 5234 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5235 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5236 auto MaxVectorElementCountMaxBW = ElementCount::get( 5237 PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType), 5238 ComputeScalableMaxVF); 5239 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5240 5241 // Collect all viable vectorization factors larger than the default MaxVF 5242 // (i.e. MaxVectorElementCount). 5243 SmallVector<ElementCount, 8> VFs; 5244 for (ElementCount VS = MaxVectorElementCount * 2; 5245 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5246 VFs.push_back(VS); 5247 5248 // For each VF calculate its register usage. 5249 auto RUs = calculateRegisterUsage(VFs); 5250 5251 // Select the largest VF which doesn't require more registers than existing 5252 // ones. 5253 for (int i = RUs.size() - 1; i >= 0; --i) { 5254 bool Selected = true; 5255 for (auto &pair : RUs[i].MaxLocalUsers) { 5256 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5257 if (pair.second > TargetNumRegisters) 5258 Selected = false; 5259 } 5260 if (Selected) { 5261 MaxVF = VFs[i]; 5262 break; 5263 } 5264 } 5265 if (ElementCount MinVF = 5266 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5267 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5268 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5269 << ") with target's minimum: " << MinVF << '\n'); 5270 MaxVF = MinVF; 5271 } 5272 } 5273 5274 // Invalidate any widening decisions we might have made, in case the loop 5275 // requires prediction (decided later), but we have already made some 5276 // load/store widening decisions. 5277 invalidateCostModelingDecisions(); 5278 } 5279 return MaxVF; 5280 } 5281 5282 std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5283 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5284 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5285 auto Min = Attr.getVScaleRangeMin(); 5286 auto Max = Attr.getVScaleRangeMax(); 5287 if (Max && Min == Max) 5288 return Max; 5289 } 5290 5291 return TTI.getVScaleForTuning(); 5292 } 5293 5294 bool LoopVectorizationCostModel::isMoreProfitable( 5295 const VectorizationFactor &A, const VectorizationFactor &B) const { 5296 InstructionCost CostA = A.Cost; 5297 InstructionCost CostB = B.Cost; 5298 5299 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5300 5301 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5302 MaxTripCount) { 5303 // If we are folding the tail and the trip count is a known (possibly small) 5304 // constant, the trip count will be rounded up to an integer number of 5305 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5306 // which we compare directly. 
When not folding the tail, the total cost will 5307 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5308 // approximated with the per-lane cost below instead of using the tripcount 5309 // as here. 5310 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5311 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5312 return RTCostA < RTCostB; 5313 } 5314 5315 // Improve estimate for the vector width if it is scalable. 5316 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5317 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5318 if (std::optional<unsigned> VScale = getVScaleForTuning()) { 5319 if (A.Width.isScalable()) 5320 EstimatedWidthA *= *VScale; 5321 if (B.Width.isScalable()) 5322 EstimatedWidthB *= *VScale; 5323 } 5324 5325 // Assume vscale may be larger than 1 (or the value being tuned for), 5326 // so that scalable vectorization is slightly favorable over fixed-width 5327 // vectorization. 5328 if (A.Width.isScalable() && !B.Width.isScalable()) 5329 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5330 5331 // To avoid the need for FP division: 5332 // (CostA / A.Width) < (CostB / B.Width) 5333 // <=> (CostA * B.Width) < (CostB * A.Width) 5334 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5335 } 5336 5337 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5338 const ElementCountSet &VFCandidates) { 5339 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5340 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5341 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5342 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5343 "Expected Scalar VF to be a candidate"); 5344 5345 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5346 ExpectedCost); 5347 VectorizationFactor ChosenFactor = ScalarCost; 5348 5349 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5350 if (ForceVectorization && VFCandidates.size() > 1) { 5351 // Ignore scalar width, because the user explicitly wants vectorization. 5352 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5353 // evaluation. 5354 ChosenFactor.Cost = InstructionCost::getMax(); 5355 } 5356 5357 SmallVector<InstructionVFPair> InvalidCosts; 5358 for (const auto &i : VFCandidates) { 5359 // The cost for scalar VF=1 is already calculated, so ignore it. 5360 if (i.isScalar()) 5361 continue; 5362 5363 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5364 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5365 5366 #ifndef NDEBUG 5367 unsigned AssumedMinimumVscale = 1; 5368 if (std::optional<unsigned> VScale = getVScaleForTuning()) 5369 AssumedMinimumVscale = *VScale; 5370 unsigned Width = 5371 Candidate.Width.isScalable() 5372 ? 
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5373             : Candidate.Width.getFixedValue();
5374     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5375                       << " costs: " << (Candidate.Cost / Width));
5376     if (i.isScalable())
5377       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5378                         << AssumedMinimumVscale << ")");
5379     LLVM_DEBUG(dbgs() << ".\n");
5380 #endif
5381
5382     if (!C.second && !ForceVectorization) {
5383       LLVM_DEBUG(
5384           dbgs() << "LV: Not considering vector loop of width " << i
5385                  << " because it will not generate any vector instructions.\n");
5386       continue;
5387     }
5388
5389     // If profitable, add it to the ProfitableVFs list.
5390     if (isMoreProfitable(Candidate, ScalarCost))
5391       ProfitableVFs.push_back(Candidate);
5392
5393     if (isMoreProfitable(Candidate, ChosenFactor))
5394       ChosenFactor = Candidate;
5395   }
5396
5397   // Emit a report of VFs with invalid costs in the loop.
5398   if (!InvalidCosts.empty()) {
5399     // Group the remarks per instruction, keeping the instruction order from
5400     // InvalidCosts.
5401     std::map<Instruction *, unsigned> Numbering;
5402     unsigned I = 0;
5403     for (auto &Pair : InvalidCosts)
5404       if (!Numbering.count(Pair.first))
5405         Numbering[Pair.first] = I++;
5406
5407     // Sort the list, first on instruction(number) then on VF.
5408     llvm::sort(InvalidCosts,
5409                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5410                  if (Numbering[A.first] != Numbering[B.first])
5411                    return Numbering[A.first] < Numbering[B.first];
5412                  ElementCountComparator ECC;
5413                  return ECC(A.second, B.second);
5414                });
5415
5416     // For a list of ordered instruction-vf pairs:
5417     //   [(load, vf1), (load, vf2), (store, vf1)]
5418     // Group the instructions together to emit separate remarks for:
5419     //   load  (vf1, vf2)
5420     //   store (vf1)
5421     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5422     auto Subset = ArrayRef<InstructionVFPair>();
5423     do {
5424       if (Subset.empty())
5425         Subset = Tail.take_front(1);
5426
5427       Instruction *I = Subset.front().first;
5428
5429       // If the next instruction is different, or if there are no other pairs,
5430       // emit a remark for the collated subset, e.g.
5431       //   [(load, vf1), (load, vf2)]
5432       // to emit:
5433       //   remark: invalid costs for 'load' at VF=(vf1, vf2)
5434       if (Subset == Tail || Tail[Subset.size()].first != I) {
5435         std::string OutString;
5436         raw_string_ostream OS(OutString);
5437         assert(!Subset.empty() && "Unexpected empty range");
5438         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5439         for (const auto &Pair : Subset)
5440           OS << (Pair.second == Subset.front().second ? "" : ", ")
5441              << Pair.second;
5442         OS << "):";
5443         if (auto *CI = dyn_cast<CallInst>(I))
5444           OS << " call to " << CI->getCalledFunction()->getName();
5445         else
5446           OS << " " << I->getOpcodeName();
5447         OS.flush();
5448         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5449         Tail = Tail.drop_front(Subset.size());
5450         Subset = {};
5451       } else
5452         // Grow the subset by one element.
5453         Subset = Tail.take_front(Subset.size() + 1);
5454     } while (!Tail.empty());
5455   }
5456
5457   if (!EnableCondStoresVectorization && NumPredStores) {
5458     reportVectorizationFailure("There are conditional stores.",
5459         "store that is conditionally executed prevents vectorization",
5460         "ConditionalStore", ORE, TheLoop);
5461     ChosenFactor = ScalarCost;
5462   }
5463
5464   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5465                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5466              << "LV: Vectorization seems to be not beneficial, "
5467              << "but was forced by a user.\n");
5468   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5469   return ChosenFactor;
5470 }
5471
5472 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5473     const Loop &L, ElementCount VF) const {
5474   // Cross iteration phis such as reductions need special handling and are
5475   // currently unsupported.
5476   if (any_of(L.getHeader()->phis(),
5477              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5478     return false;
5479
5480   // Phis with uses outside of the loop require special handling and are
5481   // currently unsupported.
5482   for (const auto &Entry : Legal->getInductionVars()) {
5483     // Look for uses of the value of the induction at the last iteration.
5484     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5485     for (User *U : PostInc->users())
5486       if (!L.contains(cast<Instruction>(U)))
5487         return false;
5488     // Look for uses of the penultimate value of the induction.
5489     for (User *U : Entry.first->users())
5490       if (!L.contains(cast<Instruction>(U)))
5491         return false;
5492   }
5493
5494   // Epilogue vectorization code has not been audited to ensure it handles
5495   // non-latch exits properly. It may be fine, but it needs to be audited and
5496   // tested.
5497   if (L.getExitingBlock() != L.getLoopLatch())
5498     return false;
5499
5500   return true;
5501 }
5502
5503 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5504     const ElementCount VF) const {
5505   // FIXME: We need a much better cost-model to take different parameters such
5506   // as register pressure, code size increase and cost of extra branches into
5507   // account. For now we apply a very crude heuristic and only consider loops
5508   // with vectorization factors larger than a certain value.
5509
5510   // Allow the target to opt out entirely.
5511   if (!TTI.preferEpilogueVectorization())
5512     return false;
5513
5514   // We also consider epilogue vectorization unprofitable for targets that don't
5515   // consider interleaving beneficial (e.g. MVE).
5516   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5517     return false;
5518   // FIXME: We should consider changing the threshold for scalable
5519   // vectors to take VScaleForTuning into account.
5520 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5521 return true; 5522 return false; 5523 } 5524 5525 VectorizationFactor 5526 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5527 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5528 VectorizationFactor Result = VectorizationFactor::Disabled(); 5529 if (!EnableEpilogueVectorization) { 5530 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5531 return Result; 5532 } 5533 5534 if (!isScalarEpilogueAllowed()) { 5535 LLVM_DEBUG( 5536 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5537 "allowed.\n";); 5538 return Result; 5539 } 5540 5541 // Not really a cost consideration, but check for unsupported cases here to 5542 // simplify the logic. 5543 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5544 LLVM_DEBUG( 5545 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5546 "not a supported candidate.\n";); 5547 return Result; 5548 } 5549 5550 if (EpilogueVectorizationForceVF > 1) { 5551 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5552 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5553 if (LVP.hasPlanWithVF(ForcedEC)) 5554 return {ForcedEC, 0, 0}; 5555 else { 5556 LLVM_DEBUG( 5557 dbgs() 5558 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5559 return Result; 5560 } 5561 } 5562 5563 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5564 TheLoop->getHeader()->getParent()->hasMinSize()) { 5565 LLVM_DEBUG( 5566 dbgs() 5567 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5568 return Result; 5569 } 5570 5571 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5572 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5573 "this loop\n"); 5574 return Result; 5575 } 5576 5577 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5578 // the main loop handles 8 lanes per iteration. We could still benefit from 5579 // vectorizing the epilogue loop with VF=4. 5580 ElementCount EstimatedRuntimeVF = MainLoopVF; 5581 if (MainLoopVF.isScalable()) { 5582 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5583 if (std::optional<unsigned> VScale = getVScaleForTuning()) 5584 EstimatedRuntimeVF *= *VScale; 5585 } 5586 5587 for (auto &NextVF : ProfitableVFs) 5588 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5589 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5590 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5591 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5592 LVP.hasPlanWithVF(NextVF.Width)) 5593 Result = NextVF; 5594 5595 if (Result != VectorizationFactor::Disabled()) 5596 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5597 << Result.Width << "\n";); 5598 return Result; 5599 } 5600 5601 std::pair<unsigned, unsigned> 5602 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5603 unsigned MinWidth = -1U; 5604 unsigned MaxWidth = 8; 5605 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5606 // For in-loop reductions, no element types are added to ElementTypesInLoop 5607 // if there are no loads/stores in the loop. In this case, check through the 5608 // reduction variables to determine the maximum width. 
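  // For example (hypothetical loop): an in-loop i32 add reduction whose values
  // are computed entirely in registers (no loads or stores) leaves
  // ElementTypesInLoop empty, so MaxWidth is derived from the i32 recurrence
  // type (32 bits, assuming no narrower cast on the reduction input) rather
  // than from any memory element type.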
5609 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5610 // Reset MaxWidth so that we can find the smallest type used by recurrences 5611 // in the loop. 5612 MaxWidth = -1U; 5613 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5614 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5615 // When finding the min width used by the recurrence we need to account 5616 // for casts on the input operands of the recurrence. 5617 MaxWidth = std::min<unsigned>( 5618 MaxWidth, std::min<unsigned>( 5619 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5620 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5621 } 5622 } else { 5623 for (Type *T : ElementTypesInLoop) { 5624 MinWidth = std::min<unsigned>( 5625 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5626 MaxWidth = std::max<unsigned>( 5627 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5628 } 5629 } 5630 return {MinWidth, MaxWidth}; 5631 } 5632 5633 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5634 ElementTypesInLoop.clear(); 5635 // For each block. 5636 for (BasicBlock *BB : TheLoop->blocks()) { 5637 // For each instruction in the loop. 5638 for (Instruction &I : BB->instructionsWithoutDebug()) { 5639 Type *T = I.getType(); 5640 5641 // Skip ignored values. 5642 if (ValuesToIgnore.count(&I)) 5643 continue; 5644 5645 // Only examine Loads, Stores and PHINodes. 5646 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5647 continue; 5648 5649 // Examine PHI nodes that are reduction variables. Update the type to 5650 // account for the recurrence type. 5651 if (auto *PN = dyn_cast<PHINode>(&I)) { 5652 if (!Legal->isReductionVariable(PN)) 5653 continue; 5654 const RecurrenceDescriptor &RdxDesc = 5655 Legal->getReductionVars().find(PN)->second; 5656 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5657 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5658 RdxDesc.getRecurrenceType(), 5659 TargetTransformInfo::ReductionFlags())) 5660 continue; 5661 T = RdxDesc.getRecurrenceType(); 5662 } 5663 5664 // Examine the stored values. 5665 if (auto *ST = dyn_cast<StoreInst>(&I)) 5666 T = ST->getValueOperand()->getType(); 5667 5668 assert(T->isSized() && 5669 "Expected the load/store/recurrence type to be sized"); 5670 5671 ElementTypesInLoop.insert(T); 5672 } 5673 } 5674 } 5675 5676 unsigned 5677 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5678 InstructionCost LoopCost) { 5679 // -- The interleave heuristics -- 5680 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5681 // There are many micro-architectural considerations that we can't predict 5682 // at this level. For example, frontend pressure (on decode or fetch) due to 5683 // code size, or the number and capabilities of the execution ports. 5684 // 5685 // We use the following heuristics to select the interleave count: 5686 // 1. If the code has reductions, then we interleave to break the cross 5687 // iteration dependency. 5688 // 2. If the loop is really small, then we interleave to reduce the loop 5689 // overhead. 5690 // 3. We don't interleave if we think that we will spill registers to memory 5691 // due to the increased register pressure. 5692 5693 if (!isScalarEpilogueAllowed()) 5694 return 1; 5695 5696 // We used the distance for the interleave count. 
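  // (That is, when loop-access analysis has established a finite maximum safe
  // dependence distance, the vectorization factor is already constrained by
  // that distance, so we conservatively avoid widening the loop's memory
  // footprint further by interleaving.)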
5697 if (Legal->getMaxSafeDepDistBytes() != -1U) 5698 return 1; 5699 5700 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5701 const bool HasReductions = !Legal->getReductionVars().empty(); 5702 // Do not interleave loops with a relatively small known or estimated trip 5703 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5704 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5705 // because with the above conditions interleaving can expose ILP and break 5706 // cross iteration dependences for reductions. 5707 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5708 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5709 return 1; 5710 5711 // If we did not calculate the cost for VF (because the user selected the VF) 5712 // then we calculate the cost of VF here. 5713 if (LoopCost == 0) { 5714 LoopCost = expectedCost(VF).first; 5715 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5716 5717 // Loop body is free and there is no need for interleaving. 5718 if (LoopCost == 0) 5719 return 1; 5720 } 5721 5722 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5723 // We divide by these constants so assume that we have at least one 5724 // instruction that uses at least one register. 5725 for (auto& pair : R.MaxLocalUsers) { 5726 pair.second = std::max(pair.second, 1U); 5727 } 5728 5729 // We calculate the interleave count using the following formula. 5730 // Subtract the number of loop invariants from the number of available 5731 // registers. These registers are used by all of the interleaved instances. 5732 // Next, divide the remaining registers by the number of registers that is 5733 // required by the loop, in order to estimate how many parallel instances 5734 // fit without causing spills. All of this is rounded down if necessary to be 5735 // a power of two. We want power of two interleave count to simplify any 5736 // addressing operations or alignment considerations. 5737 // We also want power of two interleave counts to ensure that the induction 5738 // variable of the vector loop wraps to zero, when tail is folded by masking; 5739 // this currently happens when OptForSize, in which case IC is set to 1 above. 5740 unsigned IC = UINT_MAX; 5741 5742 for (auto& pair : R.MaxLocalUsers) { 5743 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5744 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5745 << " registers of " 5746 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5747 if (VF.isScalar()) { 5748 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5749 TargetNumRegisters = ForceTargetNumScalarRegs; 5750 } else { 5751 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5752 TargetNumRegisters = ForceTargetNumVectorRegs; 5753 } 5754 unsigned MaxLocalUsers = pair.second; 5755 unsigned LoopInvariantRegs = 0; 5756 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5757 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5758 5759 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5760 // Don't count the induction variable as interleaved. 5761 if (EnableIndVarRegisterHeur) { 5762 TmpIC = 5763 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5764 std::max(1U, (MaxLocalUsers - 1))); 5765 } 5766 5767 IC = std::min(IC, TmpIC); 5768 } 5769 5770 // Clamp the interleave ranges to reasonable counts. 
5771 unsigned MaxInterleaveCount = 5772 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5773 5774 // Check if the user has overridden the max. 5775 if (VF.isScalar()) { 5776 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5777 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5778 } else { 5779 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5780 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5781 } 5782 5783 // If trip count is known or estimated compile time constant, limit the 5784 // interleave count to be less than the trip count divided by VF, provided it 5785 // is at least 1. 5786 // 5787 // For scalable vectors we can't know if interleaving is beneficial. It may 5788 // not be beneficial for small loops if none of the lanes in the second vector 5789 // iterations is enabled. However, for larger loops, there is likely to be a 5790 // similar benefit as for fixed-width vectors. For now, we choose to leave 5791 // the InterleaveCount as if vscale is '1', although if some information about 5792 // the vector is known (e.g. min vector size), we can make a better decision. 5793 if (BestKnownTC) { 5794 MaxInterleaveCount = 5795 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5796 // Make sure MaxInterleaveCount is greater than 0. 5797 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5798 } 5799 5800 assert(MaxInterleaveCount > 0 && 5801 "Maximum interleave count must be greater than 0"); 5802 5803 // Clamp the calculated IC to be between the 1 and the max interleave count 5804 // that the target and trip count allows. 5805 if (IC > MaxInterleaveCount) 5806 IC = MaxInterleaveCount; 5807 else 5808 // Make sure IC is greater than 0. 5809 IC = std::max(1u, IC); 5810 5811 assert(IC > 0 && "Interleave count must be greater than 0."); 5812 5813 // Interleave if we vectorized this loop and there is a reduction that could 5814 // benefit from interleaving. 5815 if (VF.isVector() && HasReductions) { 5816 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5817 return IC; 5818 } 5819 5820 // For any scalar loop that either requires runtime checks or predication we 5821 // are better off leaving this to the unroller. Note that if we've already 5822 // vectorized the loop we will have done the runtime check and so interleaving 5823 // won't require further checks. 5824 bool ScalarInterleavingRequiresPredication = 5825 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5826 return Legal->blockNeedsPredication(BB); 5827 })); 5828 bool ScalarInterleavingRequiresRuntimePointerCheck = 5829 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5830 5831 // We want to interleave small loops in order to reduce the loop overhead and 5832 // potentially expose ILP opportunities. 5833 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5834 << "LV: IC is " << IC << '\n' 5835 << "LV: VF is " << VF << '\n'); 5836 const bool AggressivelyInterleaveReductions = 5837 TTI.enableAggressiveInterleaving(HasReductions); 5838 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5839 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5840 // We assume that the cost overhead is 1 and we use the cost model 5841 // to estimate the cost of the loop and interleave until the cost of the 5842 // loop overhead is about 5% of the cost of the loop. 
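  // Worked example (numbers are hypothetical): with SmallLoopCost == 20 and a
  // per-iteration LoopCost of 3, SmallIC = min(IC, PowerOf2Floor(20 / 3))
  //                                      = min(IC, 4).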
5843 unsigned SmallIC = std::min( 5844 IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue())); 5845 5846 // Interleave until store/load ports (estimated by max interleave count) are 5847 // saturated. 5848 unsigned NumStores = Legal->getNumStores(); 5849 unsigned NumLoads = Legal->getNumLoads(); 5850 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5851 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5852 5853 // There is little point in interleaving for reductions containing selects 5854 // and compares when VF=1 since it may just create more overhead than it's 5855 // worth for loops with small trip counts. This is because we still have to 5856 // do the final reduction after the loop. 5857 bool HasSelectCmpReductions = 5858 HasReductions && 5859 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5860 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5861 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5862 RdxDesc.getRecurrenceKind()); 5863 }); 5864 if (HasSelectCmpReductions) { 5865 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5866 return 1; 5867 } 5868 5869 // If we have a scalar reduction (vector reductions are already dealt with 5870 // by this point), we can increase the critical path length if the loop 5871 // we're interleaving is inside another loop. For tree-wise reductions 5872 // set the limit to 2, and for ordered reductions it's best to disable 5873 // interleaving entirely. 5874 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5875 bool HasOrderedReductions = 5876 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5877 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5878 return RdxDesc.isOrdered(); 5879 }); 5880 if (HasOrderedReductions) { 5881 LLVM_DEBUG( 5882 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5883 return 1; 5884 } 5885 5886 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5887 SmallIC = std::min(SmallIC, F); 5888 StoresIC = std::min(StoresIC, F); 5889 LoadsIC = std::min(LoadsIC, F); 5890 } 5891 5892 if (EnableLoadStoreRuntimeInterleave && 5893 std::max(StoresIC, LoadsIC) > SmallIC) { 5894 LLVM_DEBUG( 5895 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5896 return std::max(StoresIC, LoadsIC); 5897 } 5898 5899 // If there are scalar reductions and TTI has enabled aggressive 5900 // interleaving for reductions, we will interleave to expose ILP. 5901 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5902 AggressivelyInterleaveReductions) { 5903 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5904 // Interleave no less than SmallIC but not as aggressive as the normal IC 5905 // to satisfy the rare situation when resources are too limited. 5906 return std::max(IC / 2, SmallIC); 5907 } else { 5908 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5909 return SmallIC; 5910 } 5911 } 5912 5913 // Interleave if this is a large loop (small loops are already dealt with by 5914 // this point) that could benefit from interleaving. 
5915   if (AggressivelyInterleaveReductions) {
5916     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5917     return IC;
5918   }
5919
5920   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5921   return 1;
5922 }
5923
5924 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5925 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5926   // This function calculates the register usage by measuring the highest number
5927   // of values that are alive at a single location. Obviously, this is a very
5928   // rough estimation. We scan the loop in topological order and
5929   // assign a number to each instruction. We use RPO to ensure that defs are
5930   // met before their users. We assume that each instruction that has in-loop
5931   // users starts an interval. We record every time that an in-loop value is
5932   // used, so we have a list of the first and last occurrences of each
5933   // instruction. Next, we transpose this data structure into a multi map that
5934   // holds the list of intervals that *end* at a specific location. This multi
5935   // map allows us to perform a linear search. We scan the instructions linearly
5936   // and record each time that a new interval starts, by placing it in a set.
5937   // If we find this value in the multi-map then we remove it from the set.
5938   // The max register usage is the maximum size of the set.
5939   // We also search for instructions that are defined outside the loop, but are
5940   // used inside the loop. We need this number separately from the max-interval
5941   // usage number because when we unroll, loop-invariant values do not require
5942   // more registers.
5943   LoopBlocksDFS DFS(TheLoop);
5944   DFS.perform(LI);
5945
5946   RegisterUsage RU;
5947
5948   // Each 'key' in the map opens a new interval. The values
5949   // of the map are the index of the 'last seen' usage of the
5950   // instruction that is the key.
5951   using IntervalMap = DenseMap<Instruction *, unsigned>;
5952
5953   // Maps an index to its instruction.
5954   SmallVector<Instruction *, 64> IdxToInstr;
5955   // Marks the end of each interval.
5956   IntervalMap EndPoint;
5957   // Saves the set of instructions that are used in the loop.
5958   SmallPtrSet<Instruction *, 8> Ends;
5959   // Saves the list of values that are used in the loop but are defined outside
5960   // the loop (not including non-instruction values such as arguments and
5961   // constants).
5962   SmallPtrSet<Value *, 8> LoopInvariants;
5963
5964   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5965     for (Instruction &I : BB->instructionsWithoutDebug()) {
5966       IdxToInstr.push_back(&I);
5967
5968       // Save the end location of each USE.
5969       for (Value *U : I.operands()) {
5970         auto *Instr = dyn_cast<Instruction>(U);
5971
5972         // Ignore non-instruction values such as arguments, constants, etc.
5973         // FIXME: Might need some motivation why these values are ignored. If
5974         // for example an argument is used inside the loop it will increase the
5975         // register pressure (so shouldn't we add it to LoopInvariants?).
5976         if (!Instr)
5977           continue;
5978
5979         // If this instruction is outside the loop then record it and continue.
5980         if (!TheLoop->contains(Instr)) {
5981           LoopInvariants.insert(Instr);
5982           continue;
5983         }
5984
5985         // Overwrite previous end points.
5986         EndPoint[Instr] = IdxToInstr.size();
5987         Ends.insert(Instr);
5988       }
5989     }
5990   }
5991
5992   // Saves the list of intervals that end with the index in 'key'.
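  // For example, if values %a and %b have the same recorded end point N, then
  // TransposeEnds[N] will contain {%a, %b}, and both of their intervals are
  // removed from the set of open intervals when the linear scan below reaches
  // index N.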
5993 using InstrList = SmallVector<Instruction *, 2>; 5994 DenseMap<unsigned, InstrList> TransposeEnds; 5995 5996 // Transpose the EndPoints to a list of values that end at each index. 5997 for (auto &Interval : EndPoint) 5998 TransposeEnds[Interval.second].push_back(Interval.first); 5999 6000 SmallPtrSet<Instruction *, 8> OpenIntervals; 6001 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6002 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6003 6004 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6005 6006 const auto &TTICapture = TTI; 6007 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6008 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6009 return 0; 6010 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6011 }; 6012 6013 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6014 Instruction *I = IdxToInstr[i]; 6015 6016 // Remove all of the instructions that end at this location. 6017 InstrList &List = TransposeEnds[i]; 6018 for (Instruction *ToRemove : List) 6019 OpenIntervals.erase(ToRemove); 6020 6021 // Ignore instructions that are never used within the loop. 6022 if (!Ends.count(I)) 6023 continue; 6024 6025 // Skip ignored values. 6026 if (ValuesToIgnore.count(I)) 6027 continue; 6028 6029 // For each VF find the maximum usage of registers. 6030 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6031 // Count the number of registers used, per register class, given all open 6032 // intervals. 6033 // Note that elements in this SmallMapVector will be default constructed 6034 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 6035 // there is no previous entry for ClassID. 6036 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6037 6038 if (VFs[j].isScalar()) { 6039 for (auto *Inst : OpenIntervals) { 6040 unsigned ClassID = 6041 TTI.getRegisterClassForType(false, Inst->getType()); 6042 // FIXME: The target might use more than one register for the type 6043 // even in the scalar case. 6044 RegUsage[ClassID] += 1; 6045 } 6046 } else { 6047 collectUniformsAndScalars(VFs[j]); 6048 for (auto *Inst : OpenIntervals) { 6049 // Skip ignored values for VF > 1. 6050 if (VecValuesToIgnore.count(Inst)) 6051 continue; 6052 if (isScalarAfterVectorization(Inst, VFs[j])) { 6053 unsigned ClassID = 6054 TTI.getRegisterClassForType(false, Inst->getType()); 6055 // FIXME: The target might use more than one register for the type 6056 // even in the scalar case. 6057 RegUsage[ClassID] += 1; 6058 } else { 6059 unsigned ClassID = 6060 TTI.getRegisterClassForType(true, Inst->getType()); 6061 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6062 } 6063 } 6064 } 6065 6066 for (auto& pair : RegUsage) { 6067 auto &Entry = MaxUsages[j][pair.first]; 6068 Entry = std::max(Entry, pair.second); 6069 } 6070 } 6071 6072 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6073 << OpenIntervals.size() << '\n'); 6074 6075 // Add the current instruction to the list of open intervals. 6076 OpenIntervals.insert(I); 6077 } 6078 6079 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6080 // Note that elements in this SmallMapVector will be default constructed 6081 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 6082 // there is no previous entry for ClassID. 
6083 SmallMapVector<unsigned, unsigned, 4> Invariant; 6084 6085 for (auto *Inst : LoopInvariants) { 6086 // FIXME: The target might use more than one register for the type 6087 // even in the scalar case. 6088 unsigned Usage = 6089 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6090 unsigned ClassID = 6091 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6092 Invariant[ClassID] += Usage; 6093 } 6094 6095 LLVM_DEBUG({ 6096 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6097 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6098 << " item\n"; 6099 for (const auto &pair : MaxUsages[i]) { 6100 dbgs() << "LV(REG): RegisterClass: " 6101 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6102 << " registers\n"; 6103 } 6104 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6105 << " item\n"; 6106 for (const auto &pair : Invariant) { 6107 dbgs() << "LV(REG): RegisterClass: " 6108 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6109 << " registers\n"; 6110 } 6111 }); 6112 6113 RU.LoopInvariantRegs = Invariant; 6114 RU.MaxLocalUsers = MaxUsages[i]; 6115 RUs[i] = RU; 6116 } 6117 6118 return RUs; 6119 } 6120 6121 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6122 ElementCount VF) { 6123 // TODO: Cost model for emulated masked load/store is completely 6124 // broken. This hack guides the cost model to use an artificially 6125 // high enough value to practically disable vectorization with such 6126 // operations, except where previously deployed legality hack allowed 6127 // using very low cost values. This is to avoid regressions coming simply 6128 // from moving "masked load/store" check from legality to cost model. 6129 // Masked Load/Gather emulation was previously never allowed. 6130 // Limited number of Masked Store/Scatter emulation was allowed. 6131 assert((isPredicatedInst(I)) && 6132 "Expecting a scalar emulated instruction"); 6133 return isa<LoadInst>(I) || 6134 (isa<StoreInst>(I) && 6135 NumPredStores > NumberOfStoresToPredicate); 6136 } 6137 6138 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6139 // If we aren't vectorizing the loop, or if we've already collected the 6140 // instructions to scalarize, there's nothing to do. Collection may already 6141 // have occurred if we have a user-selected VF and are now computing the 6142 // expected cost for interleaving. 6143 if (VF.isScalar() || VF.isZero() || 6144 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6145 return; 6146 6147 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6148 // not profitable to scalarize any instructions, the presence of VF in the 6149 // map will indicate that we've analyzed it already. 6150 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6151 6152 PredicatedBBsAfterVectorization[VF].clear(); 6153 6154 // Find all the instructions that are scalar with predication in the loop and 6155 // determine if it would be better to not if-convert the blocks they are in. 6156 // If so, we also record the instructions to scalarize. 6157 for (BasicBlock *BB : TheLoop->blocks()) { 6158 if (!blockNeedsPredicationForAnyReason(BB)) 6159 continue; 6160 for (Instruction &I : *BB) 6161 if (isScalarWithPredication(&I, VF)) { 6162 ScalarCostsTy ScalarCosts; 6163 // Do not apply discount if scalable, because that would lead to 6164 // invalid scalarization costs. 6165 // Do not apply discount logic if hacked cost is needed 6166 // for emulated masked memrefs. 
6167 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6168 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6169 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6170 // Remember that BB will remain after vectorization. 6171 PredicatedBBsAfterVectorization[VF].insert(BB); 6172 } 6173 } 6174 } 6175 6176 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 6177 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6178 assert(!isUniformAfterVectorization(PredInst, VF) && 6179 "Instruction marked uniform-after-vectorization will be predicated"); 6180 6181 // Initialize the discount to zero, meaning that the scalar version and the 6182 // vector version cost the same. 6183 InstructionCost Discount = 0; 6184 6185 // Holds instructions to analyze. The instructions we visit are mapped in 6186 // ScalarCosts. Those instructions are the ones that would be scalarized if 6187 // we find that the scalar version costs less. 6188 SmallVector<Instruction *, 8> Worklist; 6189 6190 // Returns true if the given instruction can be scalarized. 6191 auto canBeScalarized = [&](Instruction *I) -> bool { 6192 // We only attempt to scalarize instructions forming a single-use chain 6193 // from the original predicated block that would otherwise be vectorized. 6194 // Although not strictly necessary, we give up on instructions we know will 6195 // already be scalar to avoid traversing chains that are unlikely to be 6196 // beneficial. 6197 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6198 isScalarAfterVectorization(I, VF)) 6199 return false; 6200 6201 // If the instruction is scalar with predication, it will be analyzed 6202 // separately. We ignore it within the context of PredInst. 6203 if (isScalarWithPredication(I, VF)) 6204 return false; 6205 6206 // If any of the instruction's operands are uniform after vectorization, 6207 // the instruction cannot be scalarized. This prevents, for example, a 6208 // masked load from being scalarized. 6209 // 6210 // We assume we will only emit a value for lane zero of an instruction 6211 // marked uniform after vectorization, rather than VF identical values. 6212 // Thus, if we scalarize an instruction that uses a uniform, we would 6213 // create uses of values corresponding to the lanes we aren't emitting code 6214 // for. This behavior can be changed by allowing getScalarValue to clone 6215 // the lane zero values for uniforms rather than asserting. 6216 for (Use &U : I->operands()) 6217 if (auto *J = dyn_cast<Instruction>(U.get())) 6218 if (isUniformAfterVectorization(J, VF)) 6219 return false; 6220 6221 // Otherwise, we can scalarize the instruction. 6222 return true; 6223 }; 6224 6225 // Compute the expected cost discount from scalarizing the entire expression 6226 // feeding the predicated instruction. We currently only consider expressions 6227 // that are single-use instruction chains. 6228 Worklist.push_back(PredInst); 6229 while (!Worklist.empty()) { 6230 Instruction *I = Worklist.pop_back_val(); 6231 6232 // If we've already analyzed the instruction, there's nothing to do. 6233 if (ScalarCosts.find(I) != ScalarCosts.end()) 6234 continue; 6235 6236 // Compute the cost of the vector instruction. Note that this cost already 6237 // includes the scalarization overhead of the predicated instruction. 6238 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6239 6240 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6241 // the instruction as if it wasn't if-converted and instead remained in the 6242 // predicated block. We will scale this cost by block probability after 6243 // computing the scalarization overhead. 6244 InstructionCost ScalarCost = 6245 VF.getFixedValue() * 6246 getInstructionCost(I, ElementCount::getFixed(1)).first; 6247 6248 // Compute the scalarization overhead of needed insertelement instructions 6249 // and phi nodes. 6250 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6251 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6252 ScalarCost += TTI.getScalarizationOverhead( 6253 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6254 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 6255 /*Extract*/ false, CostKind); 6256 ScalarCost += 6257 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6258 } 6259 6260 // Compute the scalarization overhead of needed extractelement 6261 // instructions. For each of the instruction's operands, if the operand can 6262 // be scalarized, add it to the worklist; otherwise, account for the 6263 // overhead. 6264 for (Use &U : I->operands()) 6265 if (auto *J = dyn_cast<Instruction>(U.get())) { 6266 assert(VectorType::isValidElementType(J->getType()) && 6267 "Instruction has non-scalar type"); 6268 if (canBeScalarized(J)) 6269 Worklist.push_back(J); 6270 else if (needsExtract(J, VF)) { 6271 ScalarCost += TTI.getScalarizationOverhead( 6272 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6273 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 6274 /*Extract*/ true, CostKind); 6275 } 6276 } 6277 6278 // Scale the total scalar cost by block probability. 6279 ScalarCost /= getReciprocalPredBlockProb(); 6280 6281 // Compute the discount. A non-negative discount means the vector version 6282 // of the instruction costs more, and scalarizing would be beneficial. 6283 Discount += VectorCost - ScalarCost; 6284 ScalarCosts[I] = ScalarCost; 6285 } 6286 6287 return Discount; 6288 } 6289 6290 LoopVectorizationCostModel::VectorizationCostTy 6291 LoopVectorizationCostModel::expectedCost( 6292 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6293 VectorizationCostTy Cost; 6294 6295 // For each block. 6296 for (BasicBlock *BB : TheLoop->blocks()) { 6297 VectorizationCostTy BlockCost; 6298 6299 // For each instruction in the old loop. 6300 for (Instruction &I : BB->instructionsWithoutDebug()) { 6301 // Skip ignored values. 6302 if (ValuesToIgnore.count(&I) || 6303 (VF.isVector() && VecValuesToIgnore.count(&I))) 6304 continue; 6305 6306 VectorizationCostTy C = getInstructionCost(&I, VF); 6307 6308 // Check if we should override the cost. 6309 if (C.first.isValid() && 6310 ForceTargetInstructionCost.getNumOccurrences() > 0) 6311 C.first = InstructionCost(ForceTargetInstructionCost); 6312 6313 // Keep a list of instructions with invalid costs. 6314 if (Invalid && !C.first.isValid()) 6315 Invalid->emplace_back(&I, VF); 6316 6317 BlockCost.first += C.first; 6318 BlockCost.second |= C.second; 6319 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6320 << " for VF " << VF << " For instruction: " << I 6321 << '\n'); 6322 } 6323 6324 // If we are vectorizing a predicated block, it will have been 6325 // if-converted. This means that the block's instructions (aside from 6326 // stores and instructions that may divide by zero) will now be 6327 // unconditionally executed. For the scalar case, we may not always execute 6328 // the predicated block, if it is an if-else block. 
Thus, scale the block's 6329 // cost by the probability of executing it. blockNeedsPredication from 6330 // Legal is used so as to not include all blocks in tail folded loops. 6331 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6332 BlockCost.first /= getReciprocalPredBlockProb(); 6333 6334 Cost.first += BlockCost.first; 6335 Cost.second |= BlockCost.second; 6336 } 6337 6338 return Cost; 6339 } 6340 6341 /// Gets Address Access SCEV after verifying that the access pattern 6342 /// is loop invariant except the induction variable dependence. 6343 /// 6344 /// This SCEV can be sent to the Target in order to estimate the address 6345 /// calculation cost. 6346 static const SCEV *getAddressAccessSCEV( 6347 Value *Ptr, 6348 LoopVectorizationLegality *Legal, 6349 PredicatedScalarEvolution &PSE, 6350 const Loop *TheLoop) { 6351 6352 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6353 if (!Gep) 6354 return nullptr; 6355 6356 // We are looking for a gep with all loop invariant indices except for one 6357 // which should be an induction variable. 6358 auto SE = PSE.getSE(); 6359 unsigned NumOperands = Gep->getNumOperands(); 6360 for (unsigned i = 1; i < NumOperands; ++i) { 6361 Value *Opd = Gep->getOperand(i); 6362 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6363 !Legal->isInductionVariable(Opd)) 6364 return nullptr; 6365 } 6366 6367 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6368 return PSE.getSCEV(Ptr); 6369 } 6370 6371 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6372 return Legal->hasStride(I->getOperand(0)) || 6373 Legal->hasStride(I->getOperand(1)); 6374 } 6375 6376 InstructionCost 6377 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6378 ElementCount VF) { 6379 assert(VF.isVector() && 6380 "Scalarization cost of instruction implies vectorization."); 6381 if (VF.isScalable()) 6382 return InstructionCost::getInvalid(); 6383 6384 Type *ValTy = getLoadStoreType(I); 6385 auto SE = PSE.getSE(); 6386 6387 unsigned AS = getLoadStoreAddressSpace(I); 6388 Value *Ptr = getLoadStorePointerOperand(I); 6389 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6390 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6391 // that it is being called from this specific place. 6392 6393 // Figure out whether the access is strided and get the stride value 6394 // if it's known in compile time 6395 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6396 6397 // Get the cost of the scalar memory instruction and address computation. 6398 InstructionCost Cost = 6399 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6400 6401 // Don't pass *I here, since it is scalar but will actually be part of a 6402 // vectorized loop where the user of it is a vectorized instruction. 6403 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6404 const Align Alignment = getLoadStoreAlignment(I); 6405 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6406 ValTy->getScalarType(), 6407 Alignment, AS, CostKind); 6408 6409 // Get the overhead of the extractelement and insertelement instructions 6410 // we might create due to scalarization. 6411 Cost += getScalarizationOverhead(I, VF, CostKind); 6412 6413 // If we have a predicated load/store, it will need extra i1 extracts and 6414 // conditional branches, but may not be executed for each vector lane. Scale 6415 // the cost by the probability of executing the predicated block. 
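  // For example, assuming getReciprocalPredBlockProb() returns 2 (i.e. the
  // predicated block is expected to execute for roughly half of the lanes),
  // the scalarized cost computed above is halved before the per-lane i1
  // extract and branch overhead is added below.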
6416 if (isPredicatedInst(I)) { 6417 Cost /= getReciprocalPredBlockProb(); 6418 6419 // Add the cost of an i1 extract and a branch 6420 auto *Vec_i1Ty = 6421 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6422 Cost += TTI.getScalarizationOverhead( 6423 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6424 /*Insert=*/false, /*Extract=*/true, CostKind); 6425 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6426 6427 if (useEmulatedMaskMemRefHack(I, VF)) 6428 // Artificially setting to a high enough value to practically disable 6429 // vectorization with such operations. 6430 Cost = 3000000; 6431 } 6432 6433 return Cost; 6434 } 6435 6436 InstructionCost 6437 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6438 ElementCount VF) { 6439 Type *ValTy = getLoadStoreType(I); 6440 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6441 Value *Ptr = getLoadStorePointerOperand(I); 6442 unsigned AS = getLoadStoreAddressSpace(I); 6443 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6444 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6445 6446 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6447 "Stride should be 1 or -1 for consecutive memory access"); 6448 const Align Alignment = getLoadStoreAlignment(I); 6449 InstructionCost Cost = 0; 6450 if (Legal->isMaskRequired(I)) { 6451 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6452 CostKind); 6453 } else { 6454 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6455 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6456 CostKind, OpInfo, I); 6457 } 6458 6459 bool Reverse = ConsecutiveStride < 0; 6460 if (Reverse) 6461 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6462 std::nullopt, CostKind, 0); 6463 return Cost; 6464 } 6465 6466 InstructionCost 6467 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6468 ElementCount VF) { 6469 assert(Legal->isUniformMemOp(*I)); 6470 6471 Type *ValTy = getLoadStoreType(I); 6472 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6473 const Align Alignment = getLoadStoreAlignment(I); 6474 unsigned AS = getLoadStoreAddressSpace(I); 6475 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6476 if (isa<LoadInst>(I)) { 6477 return TTI.getAddressComputationCost(ValTy) + 6478 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6479 CostKind) + 6480 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6481 } 6482 StoreInst *SI = cast<StoreInst>(I); 6483 6484 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6485 return TTI.getAddressComputationCost(ValTy) + 6486 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6487 CostKind) + 6488 (isLoopInvariantStoreValue 6489 ? 
0 6490 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6491 CostKind, VF.getKnownMinValue() - 1)); 6492 } 6493 6494 InstructionCost 6495 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6496 ElementCount VF) { 6497 Type *ValTy = getLoadStoreType(I); 6498 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6499 const Align Alignment = getLoadStoreAlignment(I); 6500 const Value *Ptr = getLoadStorePointerOperand(I); 6501 6502 return TTI.getAddressComputationCost(VectorTy) + 6503 TTI.getGatherScatterOpCost( 6504 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6505 TargetTransformInfo::TCK_RecipThroughput, I); 6506 } 6507 6508 InstructionCost 6509 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6510 ElementCount VF) { 6511 // TODO: Once we have support for interleaving with scalable vectors 6512 // we can calculate the cost properly here. 6513 if (VF.isScalable()) 6514 return InstructionCost::getInvalid(); 6515 6516 Type *ValTy = getLoadStoreType(I); 6517 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6518 unsigned AS = getLoadStoreAddressSpace(I); 6519 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6520 6521 auto Group = getInterleavedAccessGroup(I); 6522 assert(Group && "Fail to get an interleaved access group."); 6523 6524 unsigned InterleaveFactor = Group->getFactor(); 6525 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6526 6527 // Holds the indices of existing members in the interleaved group. 6528 SmallVector<unsigned, 4> Indices; 6529 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6530 if (Group->getMember(IF)) 6531 Indices.push_back(IF); 6532 6533 // Calculate the cost of the whole interleaved group. 6534 bool UseMaskForGaps = 6535 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6536 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6537 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6538 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6539 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6540 6541 if (Group->isReverse()) { 6542 // TODO: Add support for reversed masked interleaved access. 6543 assert(!Legal->isMaskRequired(I) && 6544 "Reverse masked interleaved access not supported."); 6545 Cost += Group->getNumMembers() * 6546 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6547 std::nullopt, CostKind, 0); 6548 } 6549 return Cost; 6550 } 6551 6552 std::optional<InstructionCost> 6553 LoopVectorizationCostModel::getReductionPatternCost( 6554 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6555 using namespace llvm::PatternMatch; 6556 // Early exit for no inloop reductions 6557 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6558 return std::nullopt; 6559 auto *VectorTy = cast<VectorType>(Ty); 6560 6561 // We are looking for a pattern of, and finding the minimal acceptable cost: 6562 // reduce(mul(ext(A), ext(B))) or 6563 // reduce(mul(A, B)) or 6564 // reduce(ext(A)) or 6565 // reduce(A). 6566 // The basic idea is that we walk down the tree to do that, finding the root 6567 // reduction instruction in InLoopReductionImmediateChains. From there we find 6568 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6569 // of the components. If the reduction cost is lower then we return it for the 6570 // reduction instruction and 0 for the other instructions in the pattern. 
If
6571   // it is not, we return an invalid cost specifying that the original cost
6572   // method should be used.
6573   Instruction *RetI = I;
6574   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6575     if (!RetI->hasOneUser())
6576       return std::nullopt;
6577     RetI = RetI->user_back();
6578   }
6579
6580   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6581       RetI->user_back()->getOpcode() == Instruction::Add) {
6582     RetI = RetI->user_back();
6583   }
6584
6585   // Test if the found instruction is a reduction, and if not return
6586   // std::nullopt so that the caller falls back to the original cost modelling.
6587   if (!InLoopReductionImmediateChains.count(RetI))
6588     return std::nullopt;
6589
6590   // Find the reduction this chain is a part of and calculate the basic cost of
6591   // the reduction on its own.
6592   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6593   Instruction *ReductionPhi = LastChain;
6594   while (!isa<PHINode>(ReductionPhi))
6595     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6596
6597   const RecurrenceDescriptor &RdxDesc =
6598       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6599
6600   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6601       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6602
6603   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6604   // normal fmul instruction to the cost of the fadd reduction.
6605   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6606     BaseCost +=
6607         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6608
6609   // If we're using ordered reductions then we can just return the base cost
6610   // here, since getArithmeticReductionCost calculates the full ordered
6611   // reduction cost when FP reassociation is not allowed.
6612   if (useOrderedReductions(RdxDesc))
6613     return BaseCost;
6614
6615   // Get the operand that was not the reduction chain and match it to one of the
6616   // patterns, returning the better cost if it is found.
6617   Instruction *RedOp = RetI->getOperand(1) == LastChain
6618                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6619                            : dyn_cast<Instruction>(RetI->getOperand(1));
6620
6621   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6622
6623   Instruction *Op0, *Op1;
6624   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6625       match(RedOp,
6626             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6627       match(Op0, m_ZExtOrSExt(m_Value())) &&
6628       Op0->getOpcode() == Op1->getOpcode() &&
6629       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6630       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6631       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6632
6633     // Matched reduce.add(ext(mul(ext(A), ext(B))))
6634     // Note that the extend opcodes need to all match, or if A==B they will have
6635     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6636     // which is equally fine.
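    // For instance, with i8 inputs A and B accumulated into an i32 sum, the
    // code below compares the cost of a single multiply-accumulate reduction
    // (via getMulAccReductionCost) against the combined cost of the two
    // extends, the multiply, the outer extend and the plain add reduction.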
6637 bool IsUnsigned = isa<ZExtInst>(Op0); 6638 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6639 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6640 6641 InstructionCost ExtCost = 6642 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6643 TTI::CastContextHint::None, CostKind, Op0); 6644 InstructionCost MulCost = 6645 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6646 InstructionCost Ext2Cost = 6647 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6648 TTI::CastContextHint::None, CostKind, RedOp); 6649 6650 InstructionCost RedCost = TTI.getMulAccReductionCost( 6651 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6652 6653 if (RedCost.isValid() && 6654 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6655 return I == RetI ? RedCost : 0; 6656 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6657 !TheLoop->isLoopInvariant(RedOp)) { 6658 // Matched reduce(ext(A)) 6659 bool IsUnsigned = isa<ZExtInst>(RedOp); 6660 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6661 InstructionCost RedCost = TTI.getExtendedReductionCost( 6662 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6663 RdxDesc.getFastMathFlags(), CostKind); 6664 6665 InstructionCost ExtCost = 6666 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6667 TTI::CastContextHint::None, CostKind, RedOp); 6668 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6669 return I == RetI ? RedCost : 0; 6670 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6671 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6672 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6673 Op0->getOpcode() == Op1->getOpcode() && 6674 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6675 bool IsUnsigned = isa<ZExtInst>(Op0); 6676 Type *Op0Ty = Op0->getOperand(0)->getType(); 6677 Type *Op1Ty = Op1->getOperand(0)->getType(); 6678 Type *LargestOpTy = 6679 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6680 : Op0Ty; 6681 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6682 6683 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6684 // different sizes. We take the largest type as the ext to reduce, and add 6685 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6686 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6687 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6688 TTI::CastContextHint::None, CostKind, Op0); 6689 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6690 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6691 TTI::CastContextHint::None, CostKind, Op1); 6692 InstructionCost MulCost = 6693 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6694 6695 InstructionCost RedCost = TTI.getMulAccReductionCost( 6696 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6697 InstructionCost ExtraExtCost = 0; 6698 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6699 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6700 ExtraExtCost = TTI.getCastInstrCost( 6701 ExtraExtOp->getOpcode(), ExtType, 6702 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6703 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6704 } 6705 6706 if (RedCost.isValid() && 6707 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6708 return I == RetI ? 
RedCost : 0; 6709 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6710 // Matched reduce.add(mul()) 6711 InstructionCost MulCost = 6712 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6713 6714 InstructionCost RedCost = TTI.getMulAccReductionCost( 6715 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6716 6717 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6718 return I == RetI ? RedCost : 0; 6719 } 6720 } 6721 6722 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6723 } 6724 6725 InstructionCost 6726 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6727 ElementCount VF) { 6728 // Calculate scalar cost only. Vectorization cost should be ready at this 6729 // moment. 6730 if (VF.isScalar()) { 6731 Type *ValTy = getLoadStoreType(I); 6732 const Align Alignment = getLoadStoreAlignment(I); 6733 unsigned AS = getLoadStoreAddressSpace(I); 6734 6735 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6736 return TTI.getAddressComputationCost(ValTy) + 6737 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6738 TTI::TCK_RecipThroughput, OpInfo, I); 6739 } 6740 return getWideningCost(I, VF); 6741 } 6742 6743 LoopVectorizationCostModel::VectorizationCostTy 6744 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6745 ElementCount VF) { 6746 // If we know that this instruction will remain uniform, check the cost of 6747 // the scalar version. 6748 if (isUniformAfterVectorization(I, VF)) 6749 VF = ElementCount::getFixed(1); 6750 6751 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6752 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6753 6754 // Forced scalars do not have any scalarization overhead. 6755 auto ForcedScalar = ForcedScalars.find(VF); 6756 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6757 auto InstSet = ForcedScalar->second; 6758 if (InstSet.count(I)) 6759 return VectorizationCostTy( 6760 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6761 VF.getKnownMinValue()), 6762 false); 6763 } 6764 6765 Type *VectorTy; 6766 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6767 6768 bool TypeNotScalarized = false; 6769 if (VF.isVector() && VectorTy->isVectorTy()) { 6770 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6771 if (VF.isScalable()) 6772 // <vscale x 1 x iN> is assumed to be profitable over iN because 6773 // scalable registers are a distinct register class from scalar ones. 6774 // If we ever find a target which wants to lower scalable vectors 6775 // back to scalars, we'll need to update this code to explicitly 6776 // ask TTI about the register class uses for each part. 6777 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6778 else 6779 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6780 } else 6781 C = InstructionCost::getInvalid(); 6782 } 6783 return VectorizationCostTy(C, TypeNotScalarized); 6784 } 6785 6786 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6787 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6788 6789 // There is no mechanism yet to create a scalable scalarization loop, 6790 // so this is currently Invalid. 
6791   if (VF.isScalable())
6792     return InstructionCost::getInvalid();
6793 
6794   if (VF.isScalar())
6795     return 0;
6796 
6797   InstructionCost Cost = 0;
6798   Type *RetTy = ToVectorTy(I->getType(), VF);
6799   if (!RetTy->isVoidTy() &&
6800       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6801     Cost += TTI.getScalarizationOverhead(
6802         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6803         /*Insert*/ true,
6804         /*Extract*/ false, CostKind);
6805 
6806   // Some targets keep addresses scalar.
6807   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6808     return Cost;
6809 
6810   // Some targets support efficient element stores.
6811   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6812     return Cost;
6813 
6814   // Collect operands to consider.
6815   CallInst *CI = dyn_cast<CallInst>(I);
6816   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6817 
6818   // Skip operands that do not require extraction/scalarization and do not incur
6819   // any overhead.
6820   SmallVector<Type *> Tys;
6821   for (auto *V : filterExtractingOperands(Ops, VF))
6822     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6823   return Cost + TTI.getOperandsScalarizationOverhead(
6824                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6825 }
6826 
6827 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6828   if (VF.isScalar())
6829     return;
6830   NumPredStores = 0;
6831   for (BasicBlock *BB : TheLoop->blocks()) {
6832     // For each instruction in the old loop.
6833     for (Instruction &I : *BB) {
6834       Value *Ptr = getLoadStorePointerOperand(&I);
6835       if (!Ptr)
6836         continue;
6837 
6838       // TODO: We should generate better code and update the cost model for
6839       // predicated uniform stores. Today they are treated as any other
6840       // predicated store (see added test cases in
6841       // invariant-store-vectorization.ll).
6842       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6843         NumPredStores++;
6844 
6845       if (Legal->isUniformMemOp(I)) {
6846         auto isLegalToScalarize = [&]() {
6847           if (!VF.isScalable())
6848             // Scalarization of fixed length vectors "just works".
6849             return true;
6850 
6851           // We have dedicated lowering for unpredicated uniform loads and
6852           // stores. Note that even with tail folding we know that at least
6853           // one lane is active (i.e. generalized predication is not possible
6854           // here), and the logic below depends on this fact.
6855           if (!foldTailByMasking())
6856             return true;
6857 
6858           // For scalable vectors, a uniform memop load is always
6859           // uniform-by-parts and we know how to scalarize that.
6860           if (isa<LoadInst>(I))
6861             return true;
6862 
6863           // A uniform store isn't necessarily uniform-by-parts
6864           // and we can't assume scalarization.
6865           auto &SI = cast<StoreInst>(I);
6866           return TheLoop->isLoopInvariant(SI.getValueOperand());
6867         };
6868 
6869         const InstructionCost GatherScatterCost =
6870           isLegalGatherOrScatter(&I, VF) ?
6871           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6872 
6873         // Load: Scalar load + broadcast
6874         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6875         // FIXME: This cost is a significant under-estimate for tail folded
6876         // memory ops.
6877         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6878           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6879 
6880         // Choose the better solution for the current VF. Note that invalid
6881         // costs compare as maximally large.
If both are invalid, we get 6882 // scalable invalid which signals a failure and a vectorization abort. 6883 if (GatherScatterCost < ScalarizationCost) 6884 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6885 else 6886 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6887 continue; 6888 } 6889 6890 // We assume that widening is the best solution when possible. 6891 if (memoryInstructionCanBeWidened(&I, VF)) { 6892 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6893 int ConsecutiveStride = Legal->isConsecutivePtr( 6894 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6895 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6896 "Expected consecutive stride."); 6897 InstWidening Decision = 6898 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6899 setWideningDecision(&I, VF, Decision, Cost); 6900 continue; 6901 } 6902 6903 // Choose between Interleaving, Gather/Scatter or Scalarization. 6904 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6905 unsigned NumAccesses = 1; 6906 if (isAccessInterleaved(&I)) { 6907 auto Group = getInterleavedAccessGroup(&I); 6908 assert(Group && "Fail to get an interleaved access group."); 6909 6910 // Make one decision for the whole group. 6911 if (getWideningDecision(&I, VF) != CM_Unknown) 6912 continue; 6913 6914 NumAccesses = Group->getNumMembers(); 6915 if (interleavedAccessCanBeWidened(&I, VF)) 6916 InterleaveCost = getInterleaveGroupCost(&I, VF); 6917 } 6918 6919 InstructionCost GatherScatterCost = 6920 isLegalGatherOrScatter(&I, VF) 6921 ? getGatherScatterCost(&I, VF) * NumAccesses 6922 : InstructionCost::getInvalid(); 6923 6924 InstructionCost ScalarizationCost = 6925 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6926 6927 // Choose better solution for the current VF, 6928 // write down this decision and use it during vectorization. 6929 InstructionCost Cost; 6930 InstWidening Decision; 6931 if (InterleaveCost <= GatherScatterCost && 6932 InterleaveCost < ScalarizationCost) { 6933 Decision = CM_Interleave; 6934 Cost = InterleaveCost; 6935 } else if (GatherScatterCost < ScalarizationCost) { 6936 Decision = CM_GatherScatter; 6937 Cost = GatherScatterCost; 6938 } else { 6939 Decision = CM_Scalarize; 6940 Cost = ScalarizationCost; 6941 } 6942 // If the instructions belongs to an interleave group, the whole group 6943 // receives the same decision. The whole group receives the cost, but 6944 // the cost will actually be assigned to one instruction. 6945 if (auto Group = getInterleavedAccessGroup(&I)) 6946 setWideningDecision(Group, VF, Decision, Cost); 6947 else 6948 setWideningDecision(&I, VF, Decision, Cost); 6949 } 6950 } 6951 6952 // Make sure that any load of address and any other address computation 6953 // remains scalar unless there is gather/scatter support. This avoids 6954 // inevitable extracts into address registers, and also has the benefit of 6955 // activating LSR more, since that pass can't optimize vectorized 6956 // addresses. 6957 if (TTI.prefersVectorizedAddressing()) 6958 return; 6959 6960 // Start with all scalar pointer uses. 6961 SmallPtrSet<Instruction *, 8> AddrDefs; 6962 for (BasicBlock *BB : TheLoop->blocks()) 6963 for (Instruction &I : *BB) { 6964 Instruction *PtrDef = 6965 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6966 if (PtrDef && TheLoop->contains(PtrDef) && 6967 getWideningDecision(&I, VF) != CM_GatherScatter) 6968 AddrDefs.insert(PtrDef); 6969 } 6970 6971 // Add all instructions used to generate the addresses. 
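  // This is a simple transitive-closure walk: starting from the pointer
  // operands collected above, pull in the same-block, non-phi instructions
  // that compute them, so the whole address computation stays scalar. As an
  // illustrative source-level example (not a test case), in
  //   for (i = 0; i < n; ++i) sum += a[b[i]];
  // the load of b[i] only feeds an address, so on a target that prefers
  // scalar addressing it is kept scalar together with the arithmetic that
  // derives the address of a[b[i]] from it.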
6972   SmallVector<Instruction *, 4> Worklist;
6973   append_range(Worklist, AddrDefs);
6974   while (!Worklist.empty()) {
6975     Instruction *I = Worklist.pop_back_val();
6976     for (auto &Op : I->operands())
6977       if (auto *InstOp = dyn_cast<Instruction>(Op))
6978         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6979             AddrDefs.insert(InstOp).second)
6980           Worklist.push_back(InstOp);
6981   }
6982 
6983   for (auto *I : AddrDefs) {
6984     if (isa<LoadInst>(I)) {
6985       // Setting the desired widening decision should ideally be handled
6986       // by cost functions, but since this involves the task of finding out
6987       // if the loaded register is involved in an address computation, it is
6988       // instead changed here when we know this is the case.
6989       InstWidening Decision = getWideningDecision(I, VF);
6990       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6991         // Scalarize a widened load of address.
6992         setWideningDecision(
6993             I, VF, CM_Scalarize,
6994             (VF.getKnownMinValue() *
6995              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6996       else if (auto Group = getInterleavedAccessGroup(I)) {
6997         // Scalarize an interleave group of address loads.
6998         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6999           if (Instruction *Member = Group->getMember(I))
7000             setWideningDecision(
7001                 Member, VF, CM_Scalarize,
7002                 (VF.getKnownMinValue() *
7003                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7004         }
7005       }
7006     } else
7007       // Make sure I gets scalarized and given a cost estimate without
7008       // scalarization overhead.
7009       ForcedScalars[VF].insert(I);
7010   }
7011 }
7012 
7013 InstructionCost
7014 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7015                                                Type *&VectorTy) {
7016   Type *RetTy = I->getType();
7017   if (canTruncateToMinimalBitwidth(I, VF))
7018     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7019   auto SE = PSE.getSE();
7020   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7021 
7022   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7023                                                 ElementCount VF) -> bool {
7024     if (VF.isScalar())
7025       return true;
7026 
7027     auto Scalarized = InstsToScalarize.find(VF);
7028     assert(Scalarized != InstsToScalarize.end() &&
7029            "VF not yet analyzed for scalarization profitability");
7030     return !Scalarized->second.count(I) &&
7031            llvm::all_of(I->users(), [&](User *U) {
7032              auto *UI = cast<Instruction>(U);
7033              return !Scalarized->second.count(UI);
7034            });
7035   };
7036   (void) hasSingleCopyAfterVectorization;
7037 
7038   if (isScalarAfterVectorization(I, VF)) {
7039     // With the exception of GEPs and PHIs, after scalarization there should
7040     // only be one copy of the instruction generated in the loop. This is
7041     // because the VF is either 1, or any instructions that need scalarizing
7042     // have already been dealt with by the time we get here. As a result,
7043     // it means we don't have to multiply the instruction cost by VF.
7044     assert(I->getOpcode() == Instruction::GetElementPtr ||
7045            I->getOpcode() == Instruction::PHI ||
7046            (I->getOpcode() == Instruction::BitCast &&
7047             I->getType()->isPointerTy()) ||
7048            hasSingleCopyAfterVectorization(I, VF));
7049     VectorTy = RetTy;
7050   } else
7051     VectorTy = ToVectorTy(RetTy, VF);
7052 
7053   // TODO: We need to estimate the cost of intrinsic calls.
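  // The switch below prices a single instruction at the given VF; roughly
  // speaking, the cost of the whole loop body at that VF is the sum of these
  // per-instruction costs, which is what ultimately drives the choice of
  // vectorization factor. For example, a body consisting of a load, an add
  // and a store contributes cost(load) + cost(add) + cost(store) (plus
  // induction and branch overhead) for each candidate VF.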
7054 switch (I->getOpcode()) { 7055 case Instruction::GetElementPtr: 7056 // We mark this instruction as zero-cost because the cost of GEPs in 7057 // vectorized code depends on whether the corresponding memory instruction 7058 // is scalarized or not. Therefore, we handle GEPs with the memory 7059 // instruction cost. 7060 return 0; 7061 case Instruction::Br: { 7062 // In cases of scalarized and predicated instructions, there will be VF 7063 // predicated blocks in the vectorized loop. Each branch around these 7064 // blocks requires also an extract of its vector compare i1 element. 7065 bool ScalarPredicatedBB = false; 7066 BranchInst *BI = cast<BranchInst>(I); 7067 if (VF.isVector() && BI->isConditional() && 7068 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 7069 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 7070 ScalarPredicatedBB = true; 7071 7072 if (ScalarPredicatedBB) { 7073 // Not possible to scalarize scalable vector with predicated instructions. 7074 if (VF.isScalable()) 7075 return InstructionCost::getInvalid(); 7076 // Return cost for branches around scalarized and predicated blocks. 7077 auto *Vec_i1Ty = 7078 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7079 return ( 7080 TTI.getScalarizationOverhead( 7081 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 7082 /*Insert*/ false, /*Extract*/ true, CostKind) + 7083 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7084 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7085 // The back-edge branch will remain, as will all scalar branches. 7086 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7087 else 7088 // This branch will be eliminated by if-conversion. 7089 return 0; 7090 // Note: We currently assume zero cost for an unconditional branch inside 7091 // a predicated block since it will become a fall-through, although we 7092 // may decide in the future to call TTI for all branches. 7093 } 7094 case Instruction::PHI: { 7095 auto *Phi = cast<PHINode>(I); 7096 7097 // First-order recurrences are replaced by vector shuffles inside the loop. 7098 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 7099 SmallVector<int> Mask(VF.getKnownMinValue()); 7100 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 7101 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 7102 cast<VectorType>(VectorTy), Mask, CostKind, 7103 VF.getKnownMinValue() - 1); 7104 } 7105 7106 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7107 // converted into select instructions. We require N - 1 selects per phi 7108 // node, where N is the number of incoming values. 7109 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7110 return (Phi->getNumIncomingValues() - 1) * 7111 TTI.getCmpSelInstrCost( 7112 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7113 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7114 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7115 7116 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7117 } 7118 case Instruction::UDiv: 7119 case Instruction::SDiv: 7120 case Instruction::URem: 7121 case Instruction::SRem: 7122 if (VF.isVector() && isPredicatedInst(I)) { 7123 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 7124 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 7125 ScalarCost : SafeDivisorCost; 7126 } 7127 // We've proven all lanes safe to speculate, fall through. 
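    // (For the predicated case handled above, the two costs being compared
    // correspond to either scalarizing the division behind per-lane guards,
    // or executing it unconditionally on the whole vector after substituting
    // a safe divisor into the masked-off lanes, conceptually
    //   %d.safe = select <4 x i1> %mask, <4 x i32> %d,
    //                    <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    //   %q      = udiv <4 x i32> %n, %d.safe
    // The snippet is only an illustration of the trade-off, not code emitted
    // here.)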
7128 [[fallthrough]]; 7129 case Instruction::Add: 7130 case Instruction::FAdd: 7131 case Instruction::Sub: 7132 case Instruction::FSub: 7133 case Instruction::Mul: 7134 case Instruction::FMul: 7135 case Instruction::FDiv: 7136 case Instruction::FRem: 7137 case Instruction::Shl: 7138 case Instruction::LShr: 7139 case Instruction::AShr: 7140 case Instruction::And: 7141 case Instruction::Or: 7142 case Instruction::Xor: { 7143 // Since we will replace the stride by 1 the multiplication should go away. 7144 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7145 return 0; 7146 7147 // Detect reduction patterns 7148 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7149 return *RedCost; 7150 7151 // Certain instructions can be cheaper to vectorize if they have a constant 7152 // second vector operand. One example of this are shifts on x86. 7153 Value *Op2 = I->getOperand(1); 7154 auto Op2Info = TTI.getOperandInfo(Op2); 7155 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7156 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 7157 7158 SmallVector<const Value *, 4> Operands(I->operand_values()); 7159 return TTI.getArithmeticInstrCost( 7160 I->getOpcode(), VectorTy, CostKind, 7161 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7162 Op2Info, Operands, I); 7163 } 7164 case Instruction::FNeg: { 7165 return TTI.getArithmeticInstrCost( 7166 I->getOpcode(), VectorTy, CostKind, 7167 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7168 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7169 I->getOperand(0), I); 7170 } 7171 case Instruction::Select: { 7172 SelectInst *SI = cast<SelectInst>(I); 7173 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7174 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7175 7176 const Value *Op0, *Op1; 7177 using namespace llvm::PatternMatch; 7178 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7179 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7180 // select x, y, false --> x & y 7181 // select x, true, y --> x | y 7182 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 7183 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 7184 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7185 Op1->getType()->getScalarSizeInBits() == 1); 7186 7187 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7188 return TTI.getArithmeticInstrCost( 7189 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7190 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 7191 } 7192 7193 Type *CondTy = SI->getCondition()->getType(); 7194 if (!ScalarCond) 7195 CondTy = VectorType::get(CondTy, VF); 7196 7197 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7198 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7199 Pred = Cmp->getPredicate(); 7200 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7201 CostKind, I); 7202 } 7203 case Instruction::ICmp: 7204 case Instruction::FCmp: { 7205 Type *ValTy = I->getOperand(0)->getType(); 7206 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7207 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7208 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7209 VectorTy = ToVectorTy(ValTy, VF); 7210 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7211 cast<CmpInst>(I)->getPredicate(), CostKind, 7212 I); 7213 } 7214 case Instruction::Store: 7215 case Instruction::Load: { 7216 ElementCount Width = VF; 7217 if (Width.isVector()) { 7218 InstWidening Decision = getWideningDecision(I, Width); 7219 assert(Decision != CM_Unknown && 7220 "CM decision should be taken at this point"); 7221 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7222 return InstructionCost::getInvalid(); 7223 if (Decision == CM_Scalarize) 7224 Width = ElementCount::getFixed(1); 7225 } 7226 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7227 return getMemoryInstructionCost(I, VF); 7228 } 7229 case Instruction::BitCast: 7230 if (I->getType()->isPointerTy()) 7231 return 0; 7232 [[fallthrough]]; 7233 case Instruction::ZExt: 7234 case Instruction::SExt: 7235 case Instruction::FPToUI: 7236 case Instruction::FPToSI: 7237 case Instruction::FPExt: 7238 case Instruction::PtrToInt: 7239 case Instruction::IntToPtr: 7240 case Instruction::SIToFP: 7241 case Instruction::UIToFP: 7242 case Instruction::Trunc: 7243 case Instruction::FPTrunc: { 7244 // Computes the CastContextHint from a Load/Store instruction. 7245 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7246 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7247 "Expected a load or a store!"); 7248 7249 if (VF.isScalar() || !TheLoop->contains(I)) 7250 return TTI::CastContextHint::Normal; 7251 7252 switch (getWideningDecision(I, VF)) { 7253 case LoopVectorizationCostModel::CM_GatherScatter: 7254 return TTI::CastContextHint::GatherScatter; 7255 case LoopVectorizationCostModel::CM_Interleave: 7256 return TTI::CastContextHint::Interleave; 7257 case LoopVectorizationCostModel::CM_Scalarize: 7258 case LoopVectorizationCostModel::CM_Widen: 7259 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7260 : TTI::CastContextHint::Normal; 7261 case LoopVectorizationCostModel::CM_Widen_Reverse: 7262 return TTI::CastContextHint::Reversed; 7263 case LoopVectorizationCostModel::CM_Unknown: 7264 llvm_unreachable("Instr did not go through cost modelling?"); 7265 } 7266 7267 llvm_unreachable("Unhandled case!"); 7268 }; 7269 7270 unsigned Opcode = I->getOpcode(); 7271 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7272 // For Trunc, the context is the only user, which must be a StoreInst. 7273 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7274 if (I->hasOneUse()) 7275 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7276 CCH = ComputeCCH(Store); 7277 } 7278 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7279 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7280 Opcode == Instruction::FPExt) { 7281 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7282 CCH = ComputeCCH(Load); 7283 } 7284 7285 // We optimize the truncation of induction variables having constant 7286 // integer steps. The cost of these truncations is the same as the scalar 7287 // operation. 7288 if (isOptimizableIVTruncate(I, VF)) { 7289 auto *Trunc = cast<TruncInst>(I); 7290 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7291 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7292 } 7293 7294 // Detect reduction patterns 7295 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7296 return *RedCost; 7297 7298 Type *SrcScalarTy = I->getOperand(0)->getType(); 7299 Type *SrcVecTy = 7300 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7301 if (canTruncateToMinimalBitwidth(I, VF)) { 7302 // This cast is going to be shrunk. This may remove the cast or it might 7303 // turn it into slightly different cast. For example, if MinBW == 16, 7304 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7305 // 7306 // Calculate the modified src and dest types. 7307 Type *MinVecTy = VectorTy; 7308 if (Opcode == Instruction::Trunc) { 7309 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7310 VectorTy = 7311 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7312 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7313 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7314 VectorTy = 7315 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7316 } 7317 } 7318 7319 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7320 } 7321 case Instruction::Call: { 7322 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7323 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7324 return *RedCost; 7325 bool NeedToScalarize; 7326 CallInst *CI = cast<CallInst>(I); 7327 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7328 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7329 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7330 return std::min(CallCost, IntrinsicCost); 7331 } 7332 return CallCost; 7333 } 7334 case Instruction::ExtractValue: 7335 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7336 case Instruction::Alloca: 7337 // We cannot easily widen alloca to a scalable alloca, as 7338 // the result would need to be a vector of pointers. 7339 if (VF.isScalable()) 7340 return InstructionCost::getInvalid(); 7341 [[fallthrough]]; 7342 default: 7343 // This opcode is unknown. Assume that it is the same as 'mul'. 7344 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7345 } // end of switch. 
7346 } 7347 7348 char LoopVectorize::ID = 0; 7349 7350 static const char lv_name[] = "Loop Vectorization"; 7351 7352 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7353 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7354 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7355 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7356 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7357 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7358 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7359 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7360 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7361 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7362 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7363 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7364 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7365 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7366 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7367 7368 namespace llvm { 7369 7370 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7371 7372 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7373 bool VectorizeOnlyWhenForced) { 7374 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7375 } 7376 7377 } // end namespace llvm 7378 7379 void LoopVectorizationCostModel::collectValuesToIgnore() { 7380 // Ignore ephemeral values. 7381 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7382 7383 // Find all stores to invariant variables. Since they are going to sink 7384 // outside the loop we do not need calculate cost for them. 7385 for (BasicBlock *BB : TheLoop->blocks()) 7386 for (Instruction &I : *BB) { 7387 StoreInst *SI; 7388 if ((SI = dyn_cast<StoreInst>(&I)) && 7389 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7390 ValuesToIgnore.insert(&I); 7391 } 7392 7393 // Ignore type-promoting instructions we identified during reduction 7394 // detection. 7395 for (const auto &Reduction : Legal->getReductionVars()) { 7396 const RecurrenceDescriptor &RedDes = Reduction.second; 7397 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7398 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7399 } 7400 // Ignore type-casting instructions we identified during induction 7401 // detection. 7402 for (const auto &Induction : Legal->getInductionVars()) { 7403 const InductionDescriptor &IndDes = Induction.second; 7404 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7405 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7406 } 7407 } 7408 7409 void LoopVectorizationCostModel::collectInLoopReductions() { 7410 for (const auto &Reduction : Legal->getReductionVars()) { 7411 PHINode *Phi = Reduction.first; 7412 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7413 7414 // We don't collect reductions that are type promoted (yet). 7415 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7416 continue; 7417 7418 // If the target would prefer this reduction to happen "in-loop", then we 7419 // want to record it as such. 7420 unsigned Opcode = RdxDesc.getOpcode(); 7421 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7422 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7423 TargetTransformInfo::ReductionFlags())) 7424 continue; 7425 7426 // Check that we can correctly put the reductions into the loop, by 7427 // finding the chain of operations that leads from the phi to the loop 7428 // exit value. 
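    // For example, for a simple integer reduction such as
    //   for (i = 0; i < n; ++i) s += a[i];
    // the chain is just the single 'add' feeding back into the phi. When the
    // reduction is performed in-loop, each vector iteration accumulates into
    // a scalar value via a horizontal reduction (conceptually
    // s += reduce.add(<VF x i32> of a[i..i+VF-1])) instead of carrying a
    // vector accumulator that is reduced once after the loop.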
7429 SmallVector<Instruction *, 4> ReductionOperations = 7430 RdxDesc.getReductionOpChain(Phi, TheLoop); 7431 bool InLoop = !ReductionOperations.empty(); 7432 if (InLoop) { 7433 InLoopReductionChains[Phi] = ReductionOperations; 7434 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7435 Instruction *LastChain = Phi; 7436 for (auto *I : ReductionOperations) { 7437 InLoopReductionImmediateChains[I] = LastChain; 7438 LastChain = I; 7439 } 7440 } 7441 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7442 << " reduction for phi: " << *Phi << "\n"); 7443 } 7444 } 7445 7446 // TODO: we could return a pair of values that specify the max VF and 7447 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7448 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7449 // doesn't have a cost model that can choose which plan to execute if 7450 // more than one is generated. 7451 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7452 LoopVectorizationCostModel &CM) { 7453 unsigned WidestType; 7454 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7455 return WidestVectorRegBits / WidestType; 7456 } 7457 7458 VectorizationFactor 7459 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7460 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7461 ElementCount VF = UserVF; 7462 // Outer loop handling: They may require CFG and instruction level 7463 // transformations before even evaluating whether vectorization is profitable. 7464 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7465 // the vectorization pipeline. 7466 if (!OrigLoop->isInnermost()) { 7467 // If the user doesn't provide a vectorization factor, determine a 7468 // reasonable one. 7469 if (UserVF.isZero()) { 7470 VF = ElementCount::getFixed(determineVPlanVF( 7471 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7472 .getFixedValue(), 7473 CM)); 7474 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7475 7476 // Make sure we have a VF > 1 for stress testing. 7477 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7478 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7479 << "overriding computed VF.\n"); 7480 VF = ElementCount::getFixed(4); 7481 } 7482 } 7483 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7484 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7485 "VF needs to be a power of two"); 7486 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7487 << "VF " << VF << " to build VPlans.\n"); 7488 buildVPlans(VF, VF); 7489 7490 // For VPlan build stress testing, we bail out after VPlan construction. 7491 if (VPlanBuildStressTest) 7492 return VectorizationFactor::Disabled(); 7493 7494 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7495 } 7496 7497 LLVM_DEBUG( 7498 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7499 "VPlan-native path.\n"); 7500 return VectorizationFactor::Disabled(); 7501 } 7502 7503 std::optional<VectorizationFactor> 7504 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7505 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7506 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7507 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7508 return std::nullopt; 7509 7510 // Invalidate interleave groups if all blocks of loop will be predicated. 
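  // Background for the check below: an interleave group is normally
  // vectorized as one wide load/store plus shuffles, e.g. the two members
  // A[2*i] and A[2*i+1] become a single load of 2*VF elements that is then
  // de-interleaved. If every block of the loop is predicated (fold-tail by
  // masking), that wide access itself has to be masked, with each lane's
  // predicate replicated across the group members, and not all targets can
  // lower such masked interleaved accesses; in that case the groups are
  // invalidated and the accesses fall back to gather/scatter or
  // scalarization.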
7511 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7512 !useMaskedInterleavedAccesses(*TTI)) { 7513 LLVM_DEBUG( 7514 dbgs() 7515 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7516 "which requires masked-interleaved support.\n"); 7517 if (CM.InterleaveInfo.invalidateGroups()) 7518 // Invalidating interleave groups also requires invalidating all decisions 7519 // based on them, which includes widening decisions and uniform and scalar 7520 // values. 7521 CM.invalidateCostModelingDecisions(); 7522 } 7523 7524 ElementCount MaxUserVF = 7525 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7526 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7527 if (!UserVF.isZero() && UserVFIsLegal) { 7528 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7529 "VF needs to be a power of two"); 7530 // Collect the instructions (and their associated costs) that will be more 7531 // profitable to scalarize. 7532 if (CM.selectUserVectorizationFactor(UserVF)) { 7533 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7534 CM.collectInLoopReductions(); 7535 buildVPlansWithVPRecipes(UserVF, UserVF); 7536 LLVM_DEBUG(printPlans(dbgs())); 7537 return {{UserVF, 0, 0}}; 7538 } else 7539 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7540 "InvalidCost", ORE, OrigLoop); 7541 } 7542 7543 // Populate the set of Vectorization Factor Candidates. 7544 ElementCountSet VFCandidates; 7545 for (auto VF = ElementCount::getFixed(1); 7546 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7547 VFCandidates.insert(VF); 7548 for (auto VF = ElementCount::getScalable(1); 7549 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7550 VFCandidates.insert(VF); 7551 7552 for (const auto &VF : VFCandidates) { 7553 // Collect Uniform and Scalar instructions after vectorization with VF. 7554 CM.collectUniformsAndScalars(VF); 7555 7556 // Collect the instructions (and their associated costs) that will be more 7557 // profitable to scalarize. 7558 if (VF.isVector()) 7559 CM.collectInstsToScalarize(VF); 7560 } 7561 7562 CM.collectInLoopReductions(); 7563 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7564 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7565 7566 LLVM_DEBUG(printPlans(dbgs())); 7567 if (!MaxFactors.hasVector()) 7568 return VectorizationFactor::Disabled(); 7569 7570 // Select the optimal vectorization factor. 7571 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); 7572 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7573 return VF; 7574 } 7575 7576 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7577 assert(count_if(VPlans, 7578 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7579 1 && 7580 "Best VF has not a single VPlan."); 7581 7582 for (const VPlanPtr &Plan : VPlans) { 7583 if (Plan->hasVF(VF)) 7584 return *Plan.get(); 7585 } 7586 llvm_unreachable("No plan found!"); 7587 } 7588 7589 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7590 SmallVector<Metadata *, 4> MDs; 7591 // Reserve first location for self reference to the LoopID metadata node. 7592 MDs.push_back(nullptr); 7593 bool IsUnrollMetadata = false; 7594 MDNode *LoopID = L->getLoopID(); 7595 if (LoopID) { 7596 // First find existing loop unrolling disable metadata. 
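    // Loop metadata is a single MDNode whose first operand is a self
    // reference and whose remaining operands are property nodes, e.g.
    //   br ..., !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}
    // The scan below checks whether an unroll-disable property is already
    // present, so the runtime-unroll-disable node is only added when needed.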
7597 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7598 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7599 if (MD) { 7600 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7601 IsUnrollMetadata = 7602 S && S->getString().startswith("llvm.loop.unroll.disable"); 7603 } 7604 MDs.push_back(LoopID->getOperand(i)); 7605 } 7606 } 7607 7608 if (!IsUnrollMetadata) { 7609 // Add runtime unroll disable metadata. 7610 LLVMContext &Context = L->getHeader()->getContext(); 7611 SmallVector<Metadata *, 1> DisableOperands; 7612 DisableOperands.push_back( 7613 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7614 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7615 MDs.push_back(DisableNode); 7616 MDNode *NewLoopID = MDNode::get(Context, MDs); 7617 // Set operand 0 to refer to the loop id itself. 7618 NewLoopID->replaceOperandWith(0, NewLoopID); 7619 L->setLoopID(NewLoopID); 7620 } 7621 } 7622 7623 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7624 VPlan &BestVPlan, 7625 InnerLoopVectorizer &ILV, 7626 DominatorTree *DT, 7627 bool IsEpilogueVectorization) { 7628 assert(BestVPlan.hasVF(BestVF) && 7629 "Trying to execute plan with unsupported VF"); 7630 assert(BestVPlan.hasUF(BestUF) && 7631 "Trying to execute plan with unsupported UF"); 7632 7633 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7634 << '\n'); 7635 7636 // Workaround! Compute the trip count of the original loop and cache it 7637 // before we start modifying the CFG. This code has a systemic problem 7638 // wherein it tries to run analysis over partially constructed IR; this is 7639 // wrong, and not simply for SCEV. The trip count of the original loop 7640 // simply happens to be prone to hitting this in practice. In theory, we 7641 // can hit the same issue for any SCEV, or ValueTracking query done during 7642 // mutation. See PR49900. 7643 ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7644 7645 if (!IsEpilogueVectorization) 7646 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7647 7648 // Perform the actual loop transformation. 7649 7650 // 1. Set up the skeleton for vectorization, including vector pre-header and 7651 // middle block. The vector loop is created during VPlan execution. 7652 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7653 Value *CanonicalIVStartValue; 7654 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7655 ILV.createVectorizedLoopSkeleton(); 7656 7657 // Only use noalias metadata when using memory checks guaranteeing no overlap 7658 // across all iterations. 7659 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7660 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7661 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7662 7663 // We currently don't use LoopVersioning for the actual loop cloning but we 7664 // still use it to add the noalias metadata. 7665 // TODO: Find a better way to re-use LoopVersioning functionality to add 7666 // metadata. 
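    // Once the runtime pointer checks have proven the accesses disjoint, the
    // memory instructions of the vector loop can carry scope-based aliasing
    // metadata (!alias.scope / !noalias), which lets later passes reorder or
    // hoist them freely. A widened store might, for instance, end up looking
    // roughly like
    //   store <4 x i32> %v, ptr %p, align 4, !alias.scope !5, !noalias !7
    // (the metadata node numbers are purely illustrative).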
7667 State.LVer = std::make_unique<LoopVersioning>( 7668 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7669 PSE.getSE()); 7670 State.LVer->prepareNoAliasMetadata(); 7671 } 7672 7673 ILV.collectPoisonGeneratingRecipes(State); 7674 7675 ILV.printDebugTracesAtStart(); 7676 7677 //===------------------------------------------------===// 7678 // 7679 // Notice: any optimization or new instruction that go 7680 // into the code below should also be implemented in 7681 // the cost-model. 7682 // 7683 //===------------------------------------------------===// 7684 7685 // 2. Copy and widen instructions from the old loop into the new loop. 7686 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7687 ILV.getOrCreateVectorTripCount(nullptr), 7688 CanonicalIVStartValue, State, 7689 IsEpilogueVectorization); 7690 7691 BestVPlan.execute(&State); 7692 7693 // Keep all loop hints from the original loop on the vector loop (we'll 7694 // replace the vectorizer-specific hints below). 7695 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7696 7697 std::optional<MDNode *> VectorizedLoopID = 7698 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7699 LLVMLoopVectorizeFollowupVectorized}); 7700 7701 VPBasicBlock *HeaderVPBB = 7702 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7703 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7704 if (VectorizedLoopID) 7705 L->setLoopID(*VectorizedLoopID); 7706 else { 7707 // Keep all loop hints from the original loop on the vector loop (we'll 7708 // replace the vectorizer-specific hints below). 7709 if (MDNode *LID = OrigLoop->getLoopID()) 7710 L->setLoopID(LID); 7711 7712 LoopVectorizeHints Hints(L, true, *ORE); 7713 Hints.setAlreadyVectorized(); 7714 } 7715 AddRuntimeUnrollDisableMetaData(L); 7716 7717 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7718 // predication, updating analyses. 7719 ILV.fixVectorizedLoop(State, BestVPlan); 7720 7721 ILV.printDebugTracesAtEnd(); 7722 } 7723 7724 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7725 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7726 for (const auto &Plan : VPlans) 7727 if (PrintVPlansInDotFormat) 7728 Plan->printDOT(O); 7729 else 7730 Plan->print(O); 7731 } 7732 #endif 7733 7734 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7735 7736 //===--------------------------------------------------------------------===// 7737 // EpilogueVectorizerMainLoop 7738 //===--------------------------------------------------------------------===// 7739 7740 /// This function is partially responsible for generating the control flow 7741 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7742 std::pair<BasicBlock *, Value *> 7743 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7744 createVectorLoopSkeleton(""); 7745 7746 // Generate the code to check the minimum iteration count of the vector 7747 // epilogue (see below). 7748 EPI.EpilogueIterationCountCheck = 7749 emitIterationCountCheck(LoopScalarPreHeader, true); 7750 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7751 7752 // Generate the code to check any assumptions that we've made for SCEV 7753 // expressions. 7754 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7755 7756 // Generate the code that checks at runtime if arrays overlap. We put the 7757 // checks into a separate block to make the more common case of few elements 7758 // faster. 
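  // Conceptually, each memory runtime check verifies that two pointer ranges
  // accessed by the loop cannot overlap for the full vector trip count,
  // along the lines of
  //   no_conflict = (A.end <= B.start) || (B.end <= A.start)
  // with all pairwise checks combined; if any check fails at runtime,
  // control bypasses the vector loops and the original scalar loop runs
  // instead.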
7759 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7760 7761 // Generate the iteration count check for the main loop, *after* the check 7762 // for the epilogue loop, so that the path-length is shorter for the case 7763 // that goes directly through the vector epilogue. The longer-path length for 7764 // the main loop is compensated for, by the gain from vectorizing the larger 7765 // trip count. Note: the branch will get updated later on when we vectorize 7766 // the epilogue. 7767 EPI.MainLoopIterationCountCheck = 7768 emitIterationCountCheck(LoopScalarPreHeader, false); 7769 7770 // Generate the induction variable. 7771 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7772 7773 // Skip induction resume value creation here because they will be created in 7774 // the second pass for the scalar loop. The induction resume values for the 7775 // inductions in the epilogue loop are created before executing the plan for 7776 // the epilogue loop. 7777 7778 return {completeLoopSkeleton(), nullptr}; 7779 } 7780 7781 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7782 LLVM_DEBUG({ 7783 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7784 << "Main Loop VF:" << EPI.MainLoopVF 7785 << ", Main Loop UF:" << EPI.MainLoopUF 7786 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7787 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7788 }); 7789 } 7790 7791 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7792 DEBUG_WITH_TYPE(VerboseDebug, { 7793 dbgs() << "intermediate fn:\n" 7794 << *OrigLoop->getHeader()->getParent() << "\n"; 7795 }); 7796 } 7797 7798 BasicBlock * 7799 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7800 bool ForEpilogue) { 7801 assert(Bypass && "Expected valid bypass basic block."); 7802 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7803 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7804 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7805 // Reuse existing vector loop preheader for TC checks. 7806 // Note that new preheader block is generated for vector loop. 7807 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7808 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7809 7810 // Generate code to check if the loop's trip count is less than VF * UF of the 7811 // main vector loop. 7812 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7813 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7814 7815 Value *CheckMinIters = Builder.CreateICmp( 7816 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7817 "min.iters.check"); 7818 7819 if (!ForEpilogue) 7820 TCCheckBlock->setName("vector.main.loop.iter.check"); 7821 7822 // Create new preheader for vector loop. 7823 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7824 DT, LI, nullptr, "vector.ph"); 7825 7826 if (ForEpilogue) { 7827 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7828 DT->getNode(Bypass)->getIDom()) && 7829 "TC check is expected to dominate Bypass"); 7830 7831 // Update dominator for Bypass & LoopExit. 7832 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7833 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7834 // For loops with multiple exits, there's no edge from the middle block 7835 // to exit blocks (as the epilogue must run) and thus no need to update 7836 // the immediate dominator of the exit blocks. 
7837 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7838 7839 LoopBypassBlocks.push_back(TCCheckBlock); 7840 7841 // Save the trip count so we don't have to regenerate it in the 7842 // vec.epilog.iter.check. This is safe to do because the trip count 7843 // generated here dominates the vector epilog iter check. 7844 EPI.TripCount = Count; 7845 } 7846 7847 ReplaceInstWithInst( 7848 TCCheckBlock->getTerminator(), 7849 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7850 7851 return TCCheckBlock; 7852 } 7853 7854 //===--------------------------------------------------------------------===// 7855 // EpilogueVectorizerEpilogueLoop 7856 //===--------------------------------------------------------------------===// 7857 7858 /// This function is partially responsible for generating the control flow 7859 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7860 std::pair<BasicBlock *, Value *> 7861 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7862 createVectorLoopSkeleton("vec.epilog."); 7863 7864 // Now, compare the remaining count and if there aren't enough iterations to 7865 // execute the vectorized epilogue skip to the scalar part. 7866 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7867 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7868 LoopVectorPreHeader = 7869 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7870 LI, nullptr, "vec.epilog.ph"); 7871 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7872 VecEpilogueIterationCountCheck); 7873 7874 // Adjust the control flow taking the state info from the main loop 7875 // vectorization into account. 7876 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7877 "expected this to be saved from the previous pass."); 7878 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7879 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7880 7881 DT->changeImmediateDominator(LoopVectorPreHeader, 7882 EPI.MainLoopIterationCountCheck); 7883 7884 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7885 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7886 7887 if (EPI.SCEVSafetyCheck) 7888 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7889 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7890 if (EPI.MemSafetyCheck) 7891 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7892 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7893 7894 DT->changeImmediateDominator( 7895 VecEpilogueIterationCountCheck, 7896 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7897 7898 DT->changeImmediateDominator(LoopScalarPreHeader, 7899 EPI.EpilogueIterationCountCheck); 7900 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7901 // If there is an epilogue which must run, there's no edge from the 7902 // middle block to exit blocks and thus no need to update the immediate 7903 // dominator of the exit blocks. 7904 DT->changeImmediateDominator(LoopExitBlock, 7905 EPI.EpilogueIterationCountCheck); 7906 7907 // Keep track of bypass blocks, as they feed start values to the induction and 7908 // reduction phis in the scalar loop preheader. 
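  // The resume phis in the scalar preheader need one incoming value per
  // bypass block: when the vector loops actually ran, an induction resumes
  // from the vector trip count computed above, and when one of these checks
  // skipped the vector code entirely, it resumes from the original start
  // value (e.g. 0 for a canonical induction).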
7909 if (EPI.SCEVSafetyCheck) 7910 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7911 if (EPI.MemSafetyCheck) 7912 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7913 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7914 7915 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7916 // reductions which merge control-flow from the latch block and the middle 7917 // block. Update the incoming values here and move the Phi into the preheader. 7918 SmallVector<PHINode *, 4> PhisInBlock; 7919 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7920 PhisInBlock.push_back(&Phi); 7921 7922 for (PHINode *Phi : PhisInBlock) { 7923 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7924 Phi->replaceIncomingBlockWith( 7925 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7926 VecEpilogueIterationCountCheck); 7927 7928 // If the phi doesn't have an incoming value from the 7929 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7930 // value and also those from other check blocks. This is needed for 7931 // reduction phis only. 7932 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 7933 return EPI.EpilogueIterationCountCheck == IncB; 7934 })) 7935 continue; 7936 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7937 if (EPI.SCEVSafetyCheck) 7938 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7939 if (EPI.MemSafetyCheck) 7940 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7941 } 7942 7943 // Generate a resume induction for the vector epilogue and put it in the 7944 // vector epilogue preheader 7945 Type *IdxTy = Legal->getWidestInductionType(); 7946 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7947 LoopVectorPreHeader->getFirstNonPHI()); 7948 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7949 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7950 EPI.MainLoopIterationCountCheck); 7951 7952 // Generate induction resume values. These variables save the new starting 7953 // indexes for the scalar loop. They are used to test if there are any tail 7954 // iterations left once the vector loop has completed. 7955 // Note that when the vectorized epilogue is skipped due to iteration count 7956 // check, then the resume value for the induction variable comes from 7957 // the trip count of the main vector loop, hence passing the AdditionalBypass 7958 // argument. 7959 createInductionResumeValues({VecEpilogueIterationCountCheck, 7960 EPI.VectorTripCount} /* AdditionalBypass */); 7961 7962 return {completeLoopSkeleton(), EPResumeVal}; 7963 } 7964 7965 BasicBlock * 7966 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7967 BasicBlock *Bypass, BasicBlock *Insert) { 7968 7969 assert(EPI.TripCount && 7970 "Expected trip count to have been safed in the first pass."); 7971 assert( 7972 (!isa<Instruction>(EPI.TripCount) || 7973 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7974 "saved trip count does not dominate insertion point."); 7975 Value *TC = EPI.TripCount; 7976 IRBuilder<> Builder(Insert->getTerminator()); 7977 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7978 7979 // Generate code to check if the loop's trip count is less than VF * UF of the 7980 // vector epilogue loop. 7981 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
7982 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7983 7984 Value *CheckMinIters = 7985 Builder.CreateICmp(P, Count, 7986 createStepForVF(Builder, Count->getType(), 7987 EPI.EpilogueVF, EPI.EpilogueUF), 7988 "min.epilog.iters.check"); 7989 7990 ReplaceInstWithInst( 7991 Insert->getTerminator(), 7992 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7993 7994 LoopBypassBlocks.push_back(Insert); 7995 return Insert; 7996 } 7997 7998 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7999 LLVM_DEBUG({ 8000 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8001 << "Epilogue Loop VF:" << EPI.EpilogueVF 8002 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8003 }); 8004 } 8005 8006 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8007 DEBUG_WITH_TYPE(VerboseDebug, { 8008 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8009 }); 8010 } 8011 8012 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8013 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8014 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8015 bool PredicateAtRangeStart = Predicate(Range.Start); 8016 8017 for (ElementCount TmpVF = Range.Start * 2; 8018 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8019 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8020 Range.End = TmpVF; 8021 break; 8022 } 8023 8024 return PredicateAtRangeStart; 8025 } 8026 8027 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8028 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8029 /// of VF's starting at a given VF and extending it as much as possible. Each 8030 /// vectorization decision can potentially shorten this sub-range during 8031 /// buildVPlan(). 8032 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8033 ElementCount MaxVF) { 8034 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8035 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8036 VFRange SubRange = {VF, MaxVFPlusOne}; 8037 VPlans.push_back(buildVPlan(SubRange)); 8038 VF = SubRange.End; 8039 } 8040 } 8041 8042 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8043 VPlanPtr &Plan) { 8044 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8045 8046 // Look for cached value. 8047 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8048 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8049 if (ECEntryIt != EdgeMaskCache.end()) 8050 return ECEntryIt->second; 8051 8052 VPValue *SrcMask = createBlockInMask(Src, Plan); 8053 8054 // The terminator has to be a branch inst! 8055 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8056 assert(BI && "Unexpected terminator found"); 8057 8058 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8059 return EdgeMaskCache[Edge] = SrcMask; 8060 8061 // If source is an exiting block, we know the exit edge is dynamically dead 8062 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8063 // adding uses of an otherwise potentially dead instruction. 
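  // As a small illustrative example for the non-exiting case: for a guarded
  // statement 'if (c[i]) { ... }' inside the loop, the mask of the edge into
  // the guarded block is select(src-mask, c, false) and the mask of the edge
  // around it is select(src-mask, !c, false); the block-in mask of a join
  // block is then the OR of its incoming edge masks (see createBlockInMask).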
8064 if (OrigLoop->isLoopExiting(Src)) 8065 return EdgeMaskCache[Edge] = SrcMask; 8066 8067 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8068 assert(EdgeMask && "No Edge Mask found for condition"); 8069 8070 if (BI->getSuccessor(0) != Dst) 8071 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8072 8073 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8074 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8075 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8076 // The select version does not introduce new UB if SrcMask is false and 8077 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8078 VPValue *False = Plan->getOrAddVPValue( 8079 ConstantInt::getFalse(BI->getCondition()->getType())); 8080 EdgeMask = 8081 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8082 } 8083 8084 return EdgeMaskCache[Edge] = EdgeMask; 8085 } 8086 8087 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8088 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8089 8090 // Look for cached value. 8091 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8092 if (BCEntryIt != BlockMaskCache.end()) 8093 return BCEntryIt->second; 8094 8095 // All-one mask is modelled as no-mask following the convention for masked 8096 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8097 VPValue *BlockMask = nullptr; 8098 8099 if (OrigLoop->getHeader() == BB) { 8100 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8101 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8102 8103 assert(CM.foldTailByMasking() && "must fold the tail"); 8104 8105 // If we're using the active lane mask for control flow, then we get the 8106 // mask from the active lane mask PHI that is cached in the VPlan. 8107 PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); 8108 if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) 8109 return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); 8110 8111 // Introduce the early-exit compare IV <= BTC to form header block mask. 8112 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8113 // constructing the desired canonical IV in the header block as its first 8114 // non-phi instructions. 8115 8116 VPBasicBlock *HeaderVPBB = 8117 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8118 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8119 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8120 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8121 8122 VPBuilder::InsertPointGuard Guard(Builder); 8123 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8124 if (EmitGetActiveLaneMask != PredicationStyle::None) { 8125 VPValue *TC = Plan->getOrCreateTripCount(); 8126 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, 8127 nullptr, "active.lane.mask"); 8128 } else { 8129 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8130 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8131 } 8132 return BlockMaskCache[BB] = BlockMask; 8133 } 8134 8135 // This is the block mask. We OR all incoming edges. 8136 for (auto *Predecessor : predecessors(BB)) { 8137 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8138 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8139 return BlockMaskCache[BB] = EdgeMask; 8140 8141 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8142 BlockMask = EdgeMask; 8143 continue; 8144 } 8145 8146 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8147 } 8148 8149 return BlockMaskCache[BB] = BlockMask; 8150 } 8151 8152 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8153 ArrayRef<VPValue *> Operands, 8154 VFRange &Range, 8155 VPlanPtr &Plan) { 8156 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8157 "Must be called with either a load or store"); 8158 8159 auto willWiden = [&](ElementCount VF) -> bool { 8160 LoopVectorizationCostModel::InstWidening Decision = 8161 CM.getWideningDecision(I, VF); 8162 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8163 "CM decision should be taken at this point."); 8164 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8165 return true; 8166 if (CM.isScalarAfterVectorization(I, VF) || 8167 CM.isProfitableToScalarize(I, VF)) 8168 return false; 8169 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8170 }; 8171 8172 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8173 return nullptr; 8174 8175 VPValue *Mask = nullptr; 8176 if (Legal->isMaskRequired(I)) 8177 Mask = createBlockInMask(I->getParent(), Plan); 8178 8179 // Determine if the pointer operand of the access is either consecutive or 8180 // reverse consecutive. 8181 LoopVectorizationCostModel::InstWidening Decision = 8182 CM.getWideningDecision(I, Range.Start); 8183 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8184 bool Consecutive = 8185 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8186 8187 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8188 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8189 Consecutive, Reverse); 8190 8191 StoreInst *Store = cast<StoreInst>(I); 8192 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8193 Mask, Consecutive, Reverse); 8194 } 8195 8196 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8197 /// insert a recipe to expand the step for the induction recipe. 8198 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8199 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8200 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8201 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8202 // Returns true if an instruction \p I should be scalarized instead of 8203 // vectorized for the chosen vectorization factor. 
8204 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8205 return CM.isScalarAfterVectorization(I, VF) || 8206 CM.isProfitableToScalarize(I, VF); 8207 }; 8208 8209 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8210 [&](ElementCount VF) { 8211 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8212 }, 8213 Range); 8214 assert(IndDesc.getStartValue() == 8215 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8216 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8217 "step must be loop invariant"); 8218 8219 VPValue *Step = 8220 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8221 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8222 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8223 !NeedsScalarIVOnly); 8224 } 8225 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8226 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8227 !NeedsScalarIVOnly); 8228 } 8229 8230 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8231 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8232 8233 // Check if this is an integer or fp induction. If so, build the recipe that 8234 // produces its scalar and vector values. 8235 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8236 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8237 *PSE.getSE(), *OrigLoop, Range); 8238 8239 // Check if this is pointer induction. If so, build the recipe for it. 8240 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8241 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8242 *PSE.getSE()); 8243 assert(isa<SCEVConstant>(II->getStep())); 8244 return new VPWidenPointerInductionRecipe( 8245 Phi, Operands[0], Step, *II, 8246 LoopVectorizationPlanner::getDecisionAndClampRange( 8247 [&](ElementCount VF) { 8248 return CM.isScalarAfterVectorization(Phi, VF); 8249 }, 8250 Range)); 8251 } 8252 return nullptr; 8253 } 8254 8255 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8256 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8257 // Optimize the special case where the source is a constant integer 8258 // induction variable. Notice that we can only optimize the 'trunc' case 8259 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8260 // (c) other casts depend on pointer size. 8261 8262 // Determine whether \p K is a truncation based on an induction variable that 8263 // can be optimized. 8264 auto isOptimizableIVTruncate = 8265 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8266 return [=](ElementCount VF) -> bool { 8267 return CM.isOptimizableIVTruncate(K, VF); 8268 }; 8269 }; 8270 8271 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8272 isOptimizableIVTruncate(I), Range)) { 8273 8274 auto *Phi = cast<PHINode>(I->getOperand(0)); 8275 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8276 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8277 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8278 *PSE.getSE(), *OrigLoop, Range); 8279 } 8280 return nullptr; 8281 } 8282 8283 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8284 ArrayRef<VPValue *> Operands, 8285 VPlanPtr &Plan) { 8286 // If all incoming values are equal, the incoming VPValue can be used directly 8287 // instead of creating a new VPBlendRecipe. 
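// For example (illustrative only): if all incoming values of the phi map to
// the same VPValue %v, the early return below simply reuses %v. Otherwise the
// VPBlendRecipe built at the end pairs each incoming value with its edge mask,
// which conceptually lowers to a chain of selects, roughly
//   select(mask1, v1, select(mask2, v2, ...)).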
8288 if (llvm::all_equal(Operands))
8289 return Operands[0];
8290
8291 unsigned NumIncoming = Phi->getNumIncomingValues();
8292 // For in-loop reductions, we do not need to create an additional select.
8293 VPValue *InLoopVal = nullptr;
8294 for (unsigned In = 0; In < NumIncoming; In++) {
8295 PHINode *PhiOp =
8296 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8297 if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8298 assert(!InLoopVal && "Found more than one in-loop reduction!");
8299 InLoopVal = Operands[In];
8300 }
8301 }
8302
8303 assert((!InLoopVal || NumIncoming == 2) &&
8304 "Found an in-loop reduction for PHI with unexpected number of "
8305 "incoming values");
8306 if (InLoopVal)
8307 return Operands[Operands[0] == InLoopVal ? 1 : 0];
8308
8309 // We know that all PHIs in non-header blocks are converted into selects, so
8310 // we don't have to worry about the insertion order and we can just use the
8311 // builder. At this point we generate the predication tree. There may be
8312 // duplications since this is a simple recursive scan, but future
8313 // optimizations will clean it up.
8314 SmallVector<VPValue *, 2> OperandsWithMask;
8315
8316 for (unsigned In = 0; In < NumIncoming; In++) {
8317 VPValue *EdgeMask =
8318 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8319 assert((EdgeMask || NumIncoming == 1) &&
8320 "Multiple predecessors with one having a full mask");
8321 OperandsWithMask.push_back(Operands[In]);
8322 if (EdgeMask)
8323 OperandsWithMask.push_back(EdgeMask);
8324 }
8325 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8326 }
8327
8328 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8329 ArrayRef<VPValue *> Operands,
8330 VFRange &Range) const {
8331
8332 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8333 [this, CI](ElementCount VF) {
8334 return CM.isScalarWithPredication(CI, VF);
8335 },
8336 Range);
8337
8338 if (IsPredicated)
8339 return nullptr;
8340
8341 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8342 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8343 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8344 ID == Intrinsic::pseudoprobe ||
8345 ID == Intrinsic::experimental_noalias_scope_decl))
8346 return nullptr;
8347
8348 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8349
8350 // Is it beneficial to perform an intrinsic call compared to a lib call?
8351 bool ShouldUseVectorIntrinsic =
8352 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8353 [&](ElementCount VF) -> bool {
8354 bool NeedToScalarize = false;
8355 // Is it beneficial to perform an intrinsic call compared to a lib
8356 // call?
8357 InstructionCost CallCost =
8358 CM.getVectorCallCost(CI, VF, NeedToScalarize);
8359 InstructionCost IntrinsicCost =
8360 CM.getVectorIntrinsicCost(CI, VF);
8361 return IntrinsicCost <= CallCost;
8362 },
8363 Range);
8364 if (ShouldUseVectorIntrinsic)
8365 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8366
8367 // Is it better to call a vectorized version of the function than to
8368 // scalarize the call?
8369 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8370 [&](ElementCount VF) -> bool {
8371 // The following case may be scalarized depending on the VF.
8372 // The flag shows whether we can use a usual call for the vectorized
8373 // version of the instruction.
8374 bool NeedToScalarize = false; 8375 CM.getVectorCallCost(CI, VF, NeedToScalarize); 8376 return !NeedToScalarize; 8377 }, 8378 Range); 8379 if (ShouldUseVectorCall) 8380 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), 8381 Intrinsic::not_intrinsic); 8382 8383 return nullptr; 8384 } 8385 8386 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8387 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8388 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8389 // Instruction should be widened, unless it is scalar after vectorization, 8390 // scalarization is profitable or it is predicated. 8391 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8392 return CM.isScalarAfterVectorization(I, VF) || 8393 CM.isProfitableToScalarize(I, VF) || 8394 CM.isScalarWithPredication(I, VF); 8395 }; 8396 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8397 Range); 8398 } 8399 8400 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, 8401 ArrayRef<VPValue *> Operands, 8402 VPBasicBlock *VPBB, VPlanPtr &Plan) { 8403 switch (I->getOpcode()) { 8404 default: 8405 return nullptr; 8406 case Instruction::SDiv: 8407 case Instruction::UDiv: 8408 case Instruction::SRem: 8409 case Instruction::URem: { 8410 // If not provably safe, use a select to form a safe divisor before widening the 8411 // div/rem operation itself. Otherwise fall through to general handling below. 8412 if (CM.isPredicatedInst(I)) { 8413 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); 8414 VPValue *Mask = createBlockInMask(I->getParent(), Plan); 8415 VPValue *One = 8416 Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false)); 8417 auto *SafeRHS = 8418 new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, 8419 I->getDebugLoc()); 8420 VPBB->appendRecipe(SafeRHS); 8421 Ops[1] = SafeRHS; 8422 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8423 } 8424 LLVM_FALLTHROUGH; 8425 } 8426 case Instruction::Add: 8427 case Instruction::And: 8428 case Instruction::AShr: 8429 case Instruction::BitCast: 8430 case Instruction::FAdd: 8431 case Instruction::FCmp: 8432 case Instruction::FDiv: 8433 case Instruction::FMul: 8434 case Instruction::FNeg: 8435 case Instruction::FPExt: 8436 case Instruction::FPToSI: 8437 case Instruction::FPToUI: 8438 case Instruction::FPTrunc: 8439 case Instruction::FRem: 8440 case Instruction::FSub: 8441 case Instruction::ICmp: 8442 case Instruction::IntToPtr: 8443 case Instruction::LShr: 8444 case Instruction::Mul: 8445 case Instruction::Or: 8446 case Instruction::PtrToInt: 8447 case Instruction::Select: 8448 case Instruction::SExt: 8449 case Instruction::Shl: 8450 case Instruction::SIToFP: 8451 case Instruction::Sub: 8452 case Instruction::Trunc: 8453 case Instruction::UIToFP: 8454 case Instruction::Xor: 8455 case Instruction::ZExt: 8456 case Instruction::Freeze: 8457 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8458 }; 8459 } 8460 8461 void VPRecipeBuilder::fixHeaderPhis() { 8462 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8463 for (VPHeaderPHIRecipe *R : PhisToFix) { 8464 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8465 VPRecipeBase *IncR = 8466 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8467 R->addOperand(IncR->getVPSingleValue()); 8468 } 8469 } 8470 8471 VPBasicBlock *VPRecipeBuilder::handleReplication( 8472 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8473 VPlanPtr &Plan) { 8474 bool 
IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8475 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8476 Range);
8477
8478 bool IsPredicated = CM.isPredicatedInst(I);
8479
8480 // Even if the instruction is not marked as uniform, there are certain
8481 // intrinsic calls that can be effectively treated as such, so we check for
8482 // them here. Conservatively, we only do this for scalable vectors, since
8483 // for fixed-width VFs we can always fall back on full scalarization.
8484 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8485 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8486 case Intrinsic::assume:
8487 case Intrinsic::lifetime_start:
8488 case Intrinsic::lifetime_end:
8489 // For scalable vectors, if one of the operands is variant then we still
8490 // want to mark the call as uniform, which will generate one instruction
8491 // for just the first lane of the vector. We can't scalarize the call in
8492 // the same way as for fixed-width vectors because we don't know how many
8493 // lanes there are.
8494 //
8495 // The reasons for doing it this way for scalable vectors are:
8496 // 1. For the assume intrinsic generating the instruction for the first
8497 // lane is still better than not generating any at all. For
8498 // example, the input may be a splat across all lanes.
8499 // 2. For the lifetime start/end intrinsics the pointer operand only
8500 // does anything useful when the input comes from a stack object,
8501 // which suggests it should always be uniform. For non-stack objects
8502 // the effect is to poison the object, which still allows us to
8503 // remove the call.
8504 IsUniform = true;
8505 break;
8506 default:
8507 break;
8508 }
8509 }
8510
8511 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8512 IsUniform, IsPredicated);
8513
8514 // Find if I uses a predicated instruction. If so, it will use its scalar
8515 // value. Avoid hoisting the insert-element which packs the scalar value into
8516 // a vector value, as that happens iff all users use the vector value.
8517 for (VPValue *Op : Recipe->operands()) {
8518 auto *PredR =
8519 dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
8520 if (!PredR)
8521 continue;
8522 auto *RepR = cast<VPReplicateRecipe>(
8523 PredR->getOperand(0)->getDefiningRecipe());
8524 assert(RepR->isPredicated() &&
8525 "expected Replicate recipe to be predicated");
8526 RepR->setAlsoPack(false);
8527 }
8528
8529 // Finalize the recipe for Instr, first if it is not predicated.
8530 if (!IsPredicated) {
8531 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8532 setRecipe(I, Recipe);
8533 Plan->addVPValue(I, Recipe);
8534 VPBB->appendRecipe(Recipe);
8535 return VPBB;
8536 }
8537 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8538
8539 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8540 assert(SingleSucc && "VPBB must have a single successor when handling "
8541 "predicated replication.");
8542 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8543 // Record predicated instructions for above packing optimizations.
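// Illustrative sketch (assuming the shape produced by createReplicateRegion
// below; names follow the "pred.<opcode>" convention used there):
//
//   pred.<opcode>.entry      BranchOnMask <block-in mask>
//    |          \
//    |       pred.<opcode>.if        replicated instruction, per active lane
//    |          /
//   pred.<opcode>.continue   VPPredInstPHIRecipe merging the result (if any)
//
// The region is inserted after VPBB, followed by a fresh VPBasicBlock that is
// reconnected to VPBB's former single successor.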
8544 VPBlockBase *Region = createReplicateRegion(Recipe, Plan); 8545 VPBlockUtils::insertBlockAfter(Region, VPBB); 8546 auto *RegSucc = new VPBasicBlock(); 8547 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8548 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8549 return RegSucc; 8550 } 8551 8552 VPRegionBlock * 8553 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe, 8554 VPlanPtr &Plan) { 8555 Instruction *Instr = PredRecipe->getUnderlyingInstr(); 8556 // Instructions marked for predication are replicated and placed under an 8557 // if-then construct to prevent side-effects. 8558 // Generate recipes to compute the block mask for this region. 8559 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8560 8561 // Build the triangular if-then region. 8562 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8563 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8564 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8565 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8566 auto *PHIRecipe = Instr->getType()->isVoidTy() 8567 ? nullptr 8568 : new VPPredInstPHIRecipe(PredRecipe); 8569 if (PHIRecipe) { 8570 setRecipe(Instr, PHIRecipe); 8571 Plan->addVPValue(Instr, PHIRecipe); 8572 } else { 8573 setRecipe(Instr, PredRecipe); 8574 Plan->addVPValue(Instr, PredRecipe); 8575 } 8576 8577 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8578 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8579 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8580 8581 // Note: first set Entry as region entry and then connect successors starting 8582 // from it in order, to propagate the "parent" of each VPBasicBlock. 8583 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8584 VPBlockUtils::connectBlocks(Pred, Exiting); 8585 8586 return Region; 8587 } 8588 8589 VPRecipeOrVPValueTy 8590 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8591 ArrayRef<VPValue *> Operands, 8592 VFRange &Range, VPBasicBlock *VPBB, 8593 VPlanPtr &Plan) { 8594 // First, check for specific widening recipes that deal with inductions, Phi 8595 // nodes, calls and memory operations. 8596 VPRecipeBase *Recipe; 8597 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8598 if (Phi->getParent() != OrigLoop->getHeader()) 8599 return tryToBlend(Phi, Operands, Plan); 8600 8601 // Always record recipes for header phis. Later first-order recurrence phis 8602 // can have earlier phis as incoming values. 8603 recordRecipeOf(Phi); 8604 8605 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8606 return toVPRecipeResult(Recipe); 8607 8608 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8609 assert((Legal->isReductionVariable(Phi) || 8610 Legal->isFixedOrderRecurrence(Phi)) && 8611 "can only widen reductions and fixed-order recurrences here"); 8612 VPValue *StartV = Operands[0]; 8613 if (Legal->isReductionVariable(Phi)) { 8614 const RecurrenceDescriptor &RdxDesc = 8615 Legal->getReductionVars().find(Phi)->second; 8616 assert(RdxDesc.getRecurrenceStartValue() == 8617 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8618 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8619 CM.isInLoopReduction(Phi), 8620 CM.useOrderedReductions(RdxDesc)); 8621 } else { 8622 // TODO: Currently fixed-order recurrences are modeled as chains of 8623 // first-order recurrences. 
If there are no users of the intermediate
8624 // recurrences in the chain, the fixed order recurrence should be modeled
8625 // directly, enabling more efficient codegen.
8626 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8627 }
8628
8629 // Record the incoming value from the backedge, so we can add the incoming
8630 // value from the backedge after all recipes have been created.
8631 auto *Inc = cast<Instruction>(
8632 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8633 auto RecipeIter = Ingredient2Recipe.find(Inc);
8634 if (RecipeIter == Ingredient2Recipe.end())
8635 recordRecipeOf(Inc);
8636
8637 PhisToFix.push_back(PhiRecipe);
8638 return toVPRecipeResult(PhiRecipe);
8639 }
8640
8641 if (isa<TruncInst>(Instr) &&
8642 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8643 Range, *Plan)))
8644 return toVPRecipeResult(Recipe);
8645
8646 // All widen recipes below deal only with VF > 1.
8647 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8648 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8649 return nullptr;
8650
8651 if (auto *CI = dyn_cast<CallInst>(Instr))
8652 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8653
8654 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8655 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8656
8657 if (!shouldWiden(Instr, Range))
8658 return nullptr;
8659
8660 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8661 return toVPRecipeResult(new VPWidenGEPRecipe(
8662 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8663
8664 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8665 bool InvariantCond =
8666 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8667 return toVPRecipeResult(new VPWidenSelectRecipe(
8668 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8669 }
8670
8671 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8672 }
8673
8674 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8675 ElementCount MaxVF) {
8676 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8677
8678 // Add assume instructions we need to drop to DeadInstructions, to prevent
8679 // them from being added to the VPlan.
8680 // TODO: We only need to drop assumes in blocks that get flattened. If the
8681 // control flow is preserved, we should keep them.
8682 SmallPtrSet<Instruction *, 4> DeadInstructions;
8683 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8684 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8685
8686 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8687 // Dead instructions do not need sinking. Remove them from SinkAfter.
8688 for (Instruction *I : DeadInstructions)
8689 SinkAfter.erase(I);
8690
8691 // Cannot sink instructions after dead instructions (there won't be any
8692 // recipes for them). Instead, find the first non-dead previous instruction.
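// (Hypothetical example: if SinkAfter maps %sink -> %target but %target is a
// conditional assume that ended up in DeadInstructions, the loop below walks
// backwards from %target to the closest preceding live instruction and uses
// that as the new sink target.)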
8693 for (auto &P : Legal->getSinkAfter()) { 8694 Instruction *SinkTarget = P.second; 8695 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8696 (void)FirstInst; 8697 while (DeadInstructions.contains(SinkTarget)) { 8698 assert( 8699 SinkTarget != FirstInst && 8700 "Must find a live instruction (at least the one feeding the " 8701 "fixed-order recurrence PHI) before reaching beginning of the block"); 8702 SinkTarget = SinkTarget->getPrevNode(); 8703 assert(SinkTarget != P.first && 8704 "sink source equals target, no sinking required"); 8705 } 8706 P.second = SinkTarget; 8707 } 8708 8709 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8710 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8711 VFRange SubRange = {VF, MaxVFPlusOne}; 8712 VPlans.push_back( 8713 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8714 VF = SubRange.End; 8715 } 8716 } 8717 8718 // Add the necessary canonical IV and branch recipes required to control the 8719 // loop. 8720 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8721 bool HasNUW, 8722 bool UseLaneMaskForLoopControlFlow) { 8723 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8724 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8725 8726 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8727 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8728 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8729 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8730 Header->insert(CanonicalIVPHI, Header->begin()); 8731 8732 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8733 // IV by VF * UF. 8734 auto *CanonicalIVIncrement = 8735 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8736 : VPInstruction::CanonicalIVIncrement, 8737 {CanonicalIVPHI}, DL, "index.next"); 8738 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8739 8740 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8741 EB->appendRecipe(CanonicalIVIncrement); 8742 8743 if (UseLaneMaskForLoopControlFlow) { 8744 // Create the active lane mask instruction in the vplan preheader. 8745 VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); 8746 8747 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since 8748 // we have to take unrolling into account. Each part needs to start at 8749 // Part * VF 8750 auto *CanonicalIVIncrementParts = 8751 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8752 : VPInstruction::CanonicalIVIncrementForPart, 8753 {StartV}, DL, "index.part.next"); 8754 Preheader->appendRecipe(CanonicalIVIncrementParts); 8755 8756 // Create the ActiveLaneMask instruction using the correct start values. 8757 VPValue *TC = Plan.getOrCreateTripCount(); 8758 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8759 {CanonicalIVIncrementParts, TC}, DL, 8760 "active.lane.mask.entry"); 8761 Preheader->appendRecipe(EntryALM); 8762 8763 // Now create the ActiveLaneMaskPhi recipe in the main loop using the 8764 // preheader ActiveLaneMask instruction. 8765 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); 8766 Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); 8767 8768 // Create the active lane mask for the next iteration of the loop. 8769 CanonicalIVIncrementParts = 8770 new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementForPartNUW 8771 : VPInstruction::CanonicalIVIncrementForPart, 8772 {CanonicalIVIncrement}, DL); 8773 EB->appendRecipe(CanonicalIVIncrementParts); 8774 8775 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8776 {CanonicalIVIncrementParts, TC}, DL, 8777 "active.lane.mask.next"); 8778 EB->appendRecipe(ALM); 8779 LaneMaskPhi->addOperand(ALM); 8780 8781 // We have to invert the mask here because a true condition means jumping 8782 // to the exit block. 8783 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); 8784 EB->appendRecipe(NotMask); 8785 8786 VPInstruction *BranchBack = 8787 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); 8788 EB->appendRecipe(BranchBack); 8789 } else { 8790 // Add the BranchOnCount VPInstruction to the latch. 8791 VPInstruction *BranchBack = new VPInstruction( 8792 VPInstruction::BranchOnCount, 8793 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8794 EB->appendRecipe(BranchBack); 8795 } 8796 } 8797 8798 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8799 // original exit block. 8800 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8801 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8802 VPlan &Plan) { 8803 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8804 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8805 // Only handle single-exit loops with unique exit blocks for now. 8806 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8807 return; 8808 8809 // Introduce VPUsers modeling the exit values. 8810 for (PHINode &ExitPhi : ExitBB->phis()) { 8811 Value *IncomingValue = 8812 ExitPhi.getIncomingValueForBlock(ExitingBB); 8813 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8814 Plan.addLiveOut(&ExitPhi, V); 8815 } 8816 } 8817 8818 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8819 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8820 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8821 8822 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8823 8824 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8825 8826 // --------------------------------------------------------------------------- 8827 // Pre-construction: record ingredients whose recipes we'll need to further 8828 // process after constructing the initial VPlan. 8829 // --------------------------------------------------------------------------- 8830 8831 // Mark instructions we'll need to sink later and their targets as 8832 // ingredients whose recipe we'll need to record. 8833 for (const auto &Entry : SinkAfter) { 8834 RecipeBuilder.recordRecipeOf(Entry.first); 8835 RecipeBuilder.recordRecipeOf(Entry.second); 8836 } 8837 for (const auto &Reduction : CM.getInLoopReductionChains()) { 8838 PHINode *Phi = Reduction.first; 8839 RecurKind Kind = 8840 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8841 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8842 8843 RecipeBuilder.recordRecipeOf(Phi); 8844 for (const auto &R : ReductionOperations) { 8845 RecipeBuilder.recordRecipeOf(R); 8846 // For min/max reductions, where we have a pair of icmp/select, we also 8847 // need to record the ICmp recipe, so it can be removed later. 
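// (Illustrative, assuming a signed-max in-loop reduction: the pair
//    %cmp      = icmp sgt i32 %x, %max
//    %max.next = select i1 %cmp, i32 %x, i32 %max
//  is recorded so that once the select is replaced by a reduction recipe, the
//  then-unused recipe for %cmp can be erased as well.)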
8848 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8849 "Only min/max recurrences allowed for inloop reductions"); 8850 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8851 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8852 } 8853 } 8854 8855 // For each interleave group which is relevant for this (possibly trimmed) 8856 // Range, add it to the set of groups to be later applied to the VPlan and add 8857 // placeholders for its members' Recipes which we'll be replacing with a 8858 // single VPInterleaveRecipe. 8859 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8860 auto applyIG = [IG, this](ElementCount VF) -> bool { 8861 return (VF.isVector() && // Query is illegal for VF == 1 8862 CM.getWideningDecision(IG->getInsertPos(), VF) == 8863 LoopVectorizationCostModel::CM_Interleave); 8864 }; 8865 if (!getDecisionAndClampRange(applyIG, Range)) 8866 continue; 8867 InterleaveGroups.insert(IG); 8868 for (unsigned i = 0; i < IG->getFactor(); i++) 8869 if (Instruction *Member = IG->getMember(i)) 8870 RecipeBuilder.recordRecipeOf(Member); 8871 }; 8872 8873 // --------------------------------------------------------------------------- 8874 // Build initial VPlan: Scan the body of the loop in a topological order to 8875 // visit each basic block after having visited its predecessor basic blocks. 8876 // --------------------------------------------------------------------------- 8877 8878 // Create initial VPlan skeleton, starting with a block for the pre-header, 8879 // followed by a region for the vector loop, followed by the middle block. The 8880 // skeleton vector loop region contains a header and latch block. 8881 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8882 auto Plan = std::make_unique<VPlan>(Preheader); 8883 8884 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8885 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8886 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8887 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8888 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8889 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8890 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8891 8892 Instruction *DLInst = 8893 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8894 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8895 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 8896 !CM.foldTailByMasking(), 8897 CM.useActiveLaneMaskForControlFlow()); 8898 8899 // Scan the body of the loop in a topological order to visit each basic block 8900 // after having visited its predecessor basic blocks. 8901 LoopBlocksDFS DFS(OrigLoop); 8902 DFS.perform(LI); 8903 8904 VPBasicBlock *VPBB = HeaderVPBB; 8905 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8906 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8907 // Relevant instructions from basic block BB will be grouped into VPRecipe 8908 // ingredients and fill a new VPBasicBlock. 8909 unsigned VPBBsForBB = 0; 8910 if (VPBB != HeaderVPBB) 8911 VPBB->setName(BB->getName()); 8912 Builder.setInsertPoint(VPBB); 8913 8914 // Introduce each ingredient into VPlan. 8915 // TODO: Model and preserve debug intrinsics in VPlan. 8916 for (Instruction &I : BB->instructionsWithoutDebug()) { 8917 Instruction *Instr = &I; 8918 8919 // First filter out irrelevant instructions, to ensure no recipes are 8920 // built for them. 
8921 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8922 continue; 8923 8924 SmallVector<VPValue *, 4> Operands; 8925 auto *Phi = dyn_cast<PHINode>(Instr); 8926 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8927 Operands.push_back(Plan->getOrAddVPValue( 8928 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8929 } else { 8930 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8931 Operands = {OpRange.begin(), OpRange.end()}; 8932 } 8933 8934 // Invariant stores inside loop will be deleted and a single store 8935 // with the final reduction value will be added to the exit block 8936 StoreInst *SI; 8937 if ((SI = dyn_cast<StoreInst>(&I)) && 8938 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8939 continue; 8940 8941 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8942 Instr, Operands, Range, VPBB, Plan)) { 8943 // If Instr can be simplified to an existing VPValue, use it. 8944 if (RecipeOrValue.is<VPValue *>()) { 8945 auto *VPV = RecipeOrValue.get<VPValue *>(); 8946 Plan->addVPValue(Instr, VPV); 8947 // If the re-used value is a recipe, register the recipe for the 8948 // instruction, in case the recipe for Instr needs to be recorded. 8949 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 8950 RecipeBuilder.setRecipe(Instr, R); 8951 continue; 8952 } 8953 // Otherwise, add the new recipe. 8954 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8955 for (auto *Def : Recipe->definedValues()) { 8956 auto *UV = Def->getUnderlyingValue(); 8957 Plan->addVPValue(UV, Def); 8958 } 8959 8960 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8961 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8962 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8963 // of the header block. That can happen for truncates of induction 8964 // variables. Those recipes are moved to the phi section of the header 8965 // block after applying SinkAfter, which relies on the original 8966 // position of the trunc. 8967 assert(isa<TruncInst>(Instr)); 8968 InductionsToMove.push_back( 8969 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8970 } 8971 RecipeBuilder.setRecipe(Instr, Recipe); 8972 VPBB->appendRecipe(Recipe); 8973 continue; 8974 } 8975 8976 // Otherwise, if all widening options failed, Instruction is to be 8977 // replicated. This may create a successor for VPBB. 8978 VPBasicBlock *NextVPBB = 8979 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8980 if (NextVPBB != VPBB) { 8981 VPBB = NextVPBB; 8982 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8983 : ""); 8984 } 8985 } 8986 8987 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8988 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8989 } 8990 8991 // After here, VPBB should not be used. 8992 VPBB = nullptr; 8993 8994 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8995 8996 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8997 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8998 "entry block must be set to a VPRegionBlock having a non-empty entry " 8999 "VPBasicBlock"); 9000 RecipeBuilder.fixHeaderPhis(); 9001 9002 // --------------------------------------------------------------------------- 9003 // Transform initial VPlan: Apply previously taken decisions, in order, to 9004 // bring the VPlan to its final state. 9005 // --------------------------------------------------------------------------- 9006 9007 // Apply Sink-After legal constraints. 
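// (Illustrative note: each SinkAfter entry (Sink, Target) asks for the recipe
// created for Sink to be moved so it executes right after the recipe for
// Target, e.g. to place a user of a fixed-order recurrence after the recipe
// that produces the recurrence's previous value. The cases below only differ
// in whether Sink and/or Target ended up inside replicate regions.)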
9008 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9009 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9010 if (Region && Region->isReplicator()) { 9011 assert(Region->getNumSuccessors() == 1 && 9012 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9013 assert(R->getParent()->size() == 1 && 9014 "A recipe in an original replicator region must be the only " 9015 "recipe in its block"); 9016 return Region; 9017 } 9018 return nullptr; 9019 }; 9020 for (const auto &Entry : SinkAfter) { 9021 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9022 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9023 9024 auto *TargetRegion = GetReplicateRegion(Target); 9025 auto *SinkRegion = GetReplicateRegion(Sink); 9026 if (!SinkRegion) { 9027 // If the sink source is not a replicate region, sink the recipe directly. 9028 if (TargetRegion) { 9029 // The target is in a replication region, make sure to move Sink to 9030 // the block after it, not into the replication region itself. 9031 VPBasicBlock *NextBlock = 9032 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9033 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9034 } else 9035 Sink->moveAfter(Target); 9036 continue; 9037 } 9038 9039 // The sink source is in a replicate region. Unhook the region from the CFG. 9040 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9041 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9042 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9043 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9044 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9045 9046 if (TargetRegion) { 9047 // The target recipe is also in a replicate region, move the sink region 9048 // after the target region. 9049 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9050 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9051 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9052 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9053 } else { 9054 // The sink source is in a replicate region, we need to move the whole 9055 // replicate region, which should only contain a single recipe in the 9056 // main block. 9057 auto *SplitBlock = 9058 Target->getParent()->splitAt(std::next(Target->getIterator())); 9059 9060 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9061 9062 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9063 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9064 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9065 } 9066 } 9067 9068 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9069 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9070 9071 // Now that sink-after is done, move induction recipes for optimized truncates 9072 // to the phi section of the header block. 9073 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9074 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9075 9076 // Adjust the recipes for any inloop reductions. 9077 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 9078 RecipeBuilder, Range.Start); 9079 9080 // Introduce a recipe to combine the incoming and previous values of a 9081 // fixed-order recurrence. 
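// (Illustrative sketch of the splice semantics, assuming VF = 4: with the
//  recurrence phi holding <p0 p1 p2 p3> from the previous vector iteration and
//  the backedge value being <b0 b1 b2 b3>, FirstOrderRecurrenceSplice yields
//  <p3 b0 b1 b2>, i.e. every lane observes the value produced one scalar
//  iteration earlier.)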
9082 for (VPRecipeBase &R : 9083 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9084 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9085 if (!RecurPhi) 9086 continue; 9087 9088 VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); 9089 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed 9090 // to terminate. 9091 while (auto *PrevPhi = 9092 dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe)) 9093 PrevRecipe = &PrevPhi->getBackedgeRecipe(); 9094 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9095 auto *Region = GetReplicateRegion(PrevRecipe); 9096 if (Region) 9097 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 9098 if (!InsertBlock) { 9099 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 9100 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 9101 } 9102 if (Region || PrevRecipe->isPhi()) 9103 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9104 else 9105 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9106 9107 auto *RecurSplice = cast<VPInstruction>( 9108 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9109 {RecurPhi, RecurPhi->getBackedgeValue()})); 9110 9111 RecurPhi->replaceAllUsesWith(RecurSplice); 9112 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9113 // all users. 9114 RecurSplice->setOperand(0, RecurPhi); 9115 } 9116 9117 // Interleave memory: for each Interleave Group we marked earlier as relevant 9118 // for this VPlan, replace the Recipes widening its memory instructions with a 9119 // single VPInterleaveRecipe at its insertion point. 9120 for (const auto *IG : InterleaveGroups) { 9121 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9122 RecipeBuilder.getRecipe(IG->getInsertPos())); 9123 SmallVector<VPValue *, 4> StoredValues; 9124 for (unsigned i = 0; i < IG->getFactor(); ++i) 9125 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9126 auto *StoreR = 9127 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9128 StoredValues.push_back(StoreR->getStoredValue()); 9129 } 9130 9131 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9132 Recipe->getMask()); 9133 VPIG->insertBefore(Recipe); 9134 unsigned J = 0; 9135 for (unsigned i = 0; i < IG->getFactor(); ++i) 9136 if (Instruction *Member = IG->getMember(i)) { 9137 if (!Member->getType()->isVoidTy()) { 9138 VPValue *OriginalV = Plan->getVPValue(Member); 9139 Plan->removeVPValueFor(Member); 9140 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9141 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9142 J++; 9143 } 9144 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9145 } 9146 } 9147 9148 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9149 VF *= 2) 9150 Plan->addVF(VF); 9151 Plan->setName("Initial VPlan"); 9152 9153 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9154 // in ways that accessing values using original IR values is incorrect. 
9155 Plan->disableValue2VPValue();
9156
9157 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9158 VPlanTransforms::removeDeadRecipes(*Plan);
9159
9160 bool ShouldSimplify = true;
9161 while (ShouldSimplify) {
9162 ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
9163 ShouldSimplify |=
9164 VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
9165 ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9166 }
9167
9168 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9169 VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9170
9171 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9172 return Plan;
9173 }
9174
9175 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9176 // Outer loop handling: outer loops may require CFG and instruction level
9177 // transformations before even evaluating whether vectorization is profitable.
9178 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9179 // the vectorization pipeline.
9180 assert(!OrigLoop->isInnermost());
9181 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9182
9183 // Create new empty VPlan
9184 auto Plan = std::make_unique<VPlan>();
9185
9186 // Build hierarchical CFG
9187 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9188 HCFGBuilder.buildHierarchicalCFG();
9189
9190 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9191 VF *= 2)
9192 Plan->addVF(VF);
9193
9194 SmallPtrSet<Instruction *, 1> DeadInstructions;
9195 VPlanTransforms::VPInstructionsToVPRecipes(
9196 OrigLoop, Plan,
9197 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9198 DeadInstructions, *PSE.getSE(), *TLI);
9199
9200 // Remove the existing terminator of the exiting block of the top-most region.
9201 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9202 auto *Term =
9203 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9204 Term->eraseFromParent();
9205
9206 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9207 true, CM.useActiveLaneMaskForControlFlow());
9208 return Plan;
9209 }
9210
9211 // Adjust the recipes for reductions. For in-loop reductions the chain of
9212 // instructions leading from the loop exit instr to the phi needs to be converted
9213 // to reductions, with one operand being vector and the other being the scalar
9214 // reduction chain. For other reductions, a select is introduced between the phi
9215 // and live-out recipes when folding the tail.
9216 void LoopVectorizationPlanner::adjustRecipesForReductions(
9217 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9218 ElementCount MinVF) {
9219 for (const auto &Reduction : CM.getInLoopReductionChains()) {
9220 PHINode *Phi = Reduction.first;
9221 const RecurrenceDescriptor &RdxDesc =
9222 Legal->getReductionVars().find(Phi)->second;
9223 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9224
9225 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9226 continue;
9227
9228 // ReductionOperations are ordered top-down from the phi's use to the
9229 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9230 // which of the two operands will remain scalar and which will be reduced.
9231 // For minmax the chain will be the select instructions.
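// (Hypothetical example of an in-loop add reduction chain:
//    %sum      = phi i32 [ 0, %ph ], [ %sum.next, %latch ]
//    %add      = add i32 %sum, %a
//    %sum.next = add i32 %add, %b
//  ReductionOperations is {%add, %sum.next}. Walking it with Chain starting at
//  the phi, the operand that equals Chain stays scalar and the other operand
//  becomes the vector operand of the VPReductionRecipe created below.)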
9232 Instruction *Chain = Phi;
9233 for (Instruction *R : ReductionOperations) {
9234 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9235 RecurKind Kind = RdxDesc.getRecurrenceKind();
9236
9237 VPValue *ChainOp = Plan->getVPValue(Chain);
9238 unsigned FirstOpId;
9239 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9240 "Only min/max recurrences allowed for inloop reductions");
9241 // Recognize a call to the llvm.fmuladd intrinsic.
9242 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9243 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9244 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9245 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9246 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9247 "Expected to replace a VPWidenSelectSC");
9248 FirstOpId = 1;
9249 } else {
9250 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9251 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9252 "Expected to replace a VPWidenSC");
9253 FirstOpId = 0;
9254 }
9255 unsigned VecOpId =
9256 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9257 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9258
9259 VPValue *CondOp = nullptr;
9260 if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
9261 VPBuilder::InsertPointGuard Guard(Builder);
9262 Builder.setInsertPoint(WidenRecipe->getParent(),
9263 WidenRecipe->getIterator());
9264 CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
9265 }
9266
9267 if (IsFMulAdd) {
9268 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9269 // need to create an fmul recipe to use as the vector operand for the
9270 // fadd reduction.
9271 VPInstruction *FMulRecipe = new VPInstruction(
9272 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9273 FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9274 WidenRecipe->getParent()->insert(FMulRecipe,
9275 WidenRecipe->getIterator());
9276 VecOp = FMulRecipe;
9277 }
9278 VPReductionRecipe *RedRecipe =
9279 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9280 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9281 Plan->removeVPValueFor(R);
9282 Plan->addVPValue(R, RedRecipe);
9283 // Append the recipe to the end of the VPBasicBlock because we need to
9284 // ensure that it comes after all of its inputs, including CondOp.
9285 WidenRecipe->getParent()->appendRecipe(RedRecipe);
9286 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9287 WidenRecipe->eraseFromParent();
9288
9289 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9290 VPRecipeBase *CompareRecipe =
9291 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9292 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9293 "Expected to replace a VPWidenSC");
9294 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9295 "Expected no remaining users");
9296 CompareRecipe->eraseFromParent();
9297 }
9298 Chain = R;
9299 }
9300 }
9301
9302 // If the tail is folded by masking, introduce selects between the phi
9303 // and the live-out instruction of each reduction, at the beginning of the
9304 // dedicated latch block.
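// (Illustrative, for a reduction phi %red.phi with backedge value %red.next:
//  the select added below is roughly
//    %s = select <header mask>, %red.next, %red.phi
//  so lanes that are masked off in the final, partially active iteration keep
//  the previous value instead of taking an update from an inactive lane.)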
9305 if (CM.foldTailByMasking()) { 9306 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9307 for (VPRecipeBase &R : 9308 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9309 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9310 if (!PhiR || PhiR->isInLoop()) 9311 continue; 9312 VPValue *Cond = 9313 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9314 VPValue *Red = PhiR->getBackedgeValue(); 9315 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && 9316 "reduction recipe must be defined before latch"); 9317 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9318 } 9319 } 9320 } 9321 9322 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9323 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9324 VPSlotTracker &SlotTracker) const { 9325 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9326 IG->getInsertPos()->printAsOperand(O, false); 9327 O << ", "; 9328 getAddr()->printAsOperand(O, SlotTracker); 9329 VPValue *Mask = getMask(); 9330 if (Mask) { 9331 O << ", "; 9332 Mask->printAsOperand(O, SlotTracker); 9333 } 9334 9335 unsigned OpIdx = 0; 9336 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9337 if (!IG->getMember(i)) 9338 continue; 9339 if (getNumStoreOperands() > 0) { 9340 O << "\n" << Indent << " store "; 9341 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9342 O << " to index " << i; 9343 } else { 9344 O << "\n" << Indent << " "; 9345 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9346 O << " = load from index " << i; 9347 } 9348 ++OpIdx; 9349 } 9350 } 9351 #endif 9352 9353 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9354 assert(!State.Instance && "Int or FP induction being replicated."); 9355 9356 Value *Start = getStartValue()->getLiveInIRValue(); 9357 const InductionDescriptor &ID = getInductionDescriptor(); 9358 TruncInst *Trunc = getTruncInst(); 9359 IRBuilderBase &Builder = State.Builder; 9360 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9361 assert(State.VF.isVector() && "must have vector VF"); 9362 9363 // The value from the original loop to which we are mapping the new induction 9364 // variable. 9365 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9366 9367 // Fast-math-flags propagate from the original induction instruction. 9368 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9369 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9370 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9371 9372 // Now do the actual transformations, and start with fetching the step value. 
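// (Illustrative sketch of the overall result, assuming an integer IV with
//  start St, step S, VF = 4 and UF = 2:
//    vec.ind      = phi [ <St, St+S, St+2*S, St+3*S>, vector.ph ], ...
//    step.add     = vec.ind + <4*S, 4*S, 4*S, 4*S>      ; used by part 1
//    vec.ind.next = step.add + <4*S, 4*S, 4*S, 4*S>     ; feeds the phi
//  Part 0 uses vec.ind directly.)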
9373 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9374 9375 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9376 "Expected either an induction phi-node or a truncate of it!"); 9377 9378 // Construct the initial value of the vector IV in the vector loop preheader 9379 auto CurrIP = Builder.saveIP(); 9380 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9381 Builder.SetInsertPoint(VectorPH->getTerminator()); 9382 if (isa<TruncInst>(EntryVal)) { 9383 assert(Start->getType()->isIntegerTy() && 9384 "Truncation requires an integer type"); 9385 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9386 Step = Builder.CreateTrunc(Step, TruncType); 9387 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9388 } 9389 9390 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9391 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9392 Value *SteppedStart = getStepVector( 9393 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9394 9395 // We create vector phi nodes for both integer and floating-point induction 9396 // variables. Here, we determine the kind of arithmetic we will perform. 9397 Instruction::BinaryOps AddOp; 9398 Instruction::BinaryOps MulOp; 9399 if (Step->getType()->isIntegerTy()) { 9400 AddOp = Instruction::Add; 9401 MulOp = Instruction::Mul; 9402 } else { 9403 AddOp = ID.getInductionOpcode(); 9404 MulOp = Instruction::FMul; 9405 } 9406 9407 // Multiply the vectorization factor by the step using integer or 9408 // floating-point arithmetic as appropriate. 9409 Type *StepType = Step->getType(); 9410 Value *RuntimeVF; 9411 if (Step->getType()->isFloatingPointTy()) 9412 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9413 else 9414 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9415 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9416 9417 // Create a vector splat to use in the induction update. 9418 // 9419 // FIXME: If the step is non-constant, we create the vector splat with 9420 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9421 // handle a constant vector splat. 9422 Value *SplatVF = isa<Constant>(Mul) 9423 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9424 : Builder.CreateVectorSplat(State.VF, Mul); 9425 Builder.restoreIP(CurrIP); 9426 9427 // We may need to add the step a number of times, depending on the unroll 9428 // factor. The last of those goes into the PHI. 9429 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9430 &*State.CFG.PrevBB->getFirstInsertionPt()); 9431 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9432 Instruction *LastInduction = VecInd; 9433 for (unsigned Part = 0; Part < State.UF; ++Part) { 9434 State.set(this, LastInduction, Part); 9435 9436 if (isa<TruncInst>(EntryVal)) 9437 State.addMetadata(LastInduction, EntryVal); 9438 9439 LastInduction = cast<Instruction>( 9440 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9441 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9442 } 9443 9444 LastInduction->setName("vec.ind.next"); 9445 VecInd->addIncoming(SteppedStart, VectorPH); 9446 // Add induction update using an incorrect block temporarily. The phi node 9447 // will be fixed after VPlan execution. Note that at this point the latch 9448 // block cannot be used, as it does not exist yet. 9449 // TODO: Model increment value in VPlan, by turning the recipe into a 9450 // multi-def and a subclass of VPHeaderPHIRecipe. 
9451 VecInd->addIncoming(LastInduction, VectorPH); 9452 } 9453 9454 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9455 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9456 "Not a pointer induction according to InductionDescriptor!"); 9457 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9458 "Unexpected type."); 9459 9460 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9461 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9462 9463 if (onlyScalarsGenerated(State.VF)) { 9464 // This is the normalized GEP that starts counting at zero. 9465 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9466 CanonicalIV, IndDesc.getStep()->getType()); 9467 // Determine the number of scalars we need to generate for each unroll 9468 // iteration. If the instruction is uniform, we only need to generate the 9469 // first lane. Otherwise, we generate all VF values. 9470 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9471 assert((IsUniform || !State.VF.isScalable()) && 9472 "Cannot scalarize a scalable VF"); 9473 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9474 9475 for (unsigned Part = 0; Part < State.UF; ++Part) { 9476 Value *PartStart = 9477 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9478 9479 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9480 Value *Idx = State.Builder.CreateAdd( 9481 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9482 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9483 9484 Value *Step = State.get(getOperand(1), VPIteration(0, Part)); 9485 Value *SclrGep = emitTransformedIndex( 9486 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9487 SclrGep->setName("next.gep"); 9488 State.set(this, SclrGep, VPIteration(Part, Lane)); 9489 } 9490 } 9491 return; 9492 } 9493 9494 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9495 "Induction step not a SCEV constant!"); 9496 Type *PhiType = IndDesc.getStep()->getType(); 9497 9498 // Build a pointer phi 9499 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9500 Type *ScStValueType = ScalarStartValue->getType(); 9501 PHINode *NewPointerPhi = 9502 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9503 9504 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9505 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9506 9507 // A pointer induction, performed by using a gep 9508 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9509 9510 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9511 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9512 Value *NumUnrolledElems = 9513 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9514 Value *InductionGEP = GetElementPtrInst::Create( 9515 IndDesc.getElementType(), NewPointerPhi, 9516 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9517 InductionLoc); 9518 // Add induction update using an incorrect block temporarily. The phi node 9519 // will be fixed after VPlan execution. Note that at this point the latch 9520 // block cannot be used, as it does not exist yet. 9521 // TODO: Model increment value in VPlan, by turning the recipe into a 9522 // multi-def and a subclass of VPHeaderPHIRecipe. 9523 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9524 9525 // Create UF many actual address geps that use the pointer 9526 // phi as base and a vectorized version of the step value 9527 // (<step*0, ..., step*N>) as offset. 
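// (Illustrative, assuming VF = 4, UF = 2 and scalar step S: part 0 uses
//  offsets <0,1,2,3> * S and part 1 uses offsets <4,5,6,7> * S, both as GEPs
//  off pointer.phi, while ptr.ind above advances the phi by 4 * 2 * S elements
//  per vector iteration.)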
9528 for (unsigned Part = 0; Part < State.UF; ++Part) { 9529 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9530 Value *StartOffsetScalar = 9531 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9532 Value *StartOffset = 9533 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9534 // Create a vector of consecutive numbers from zero to VF. 9535 StartOffset = State.Builder.CreateAdd( 9536 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9537 9538 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) && 9539 "scalar step must be the same across all parts"); 9540 Value *GEP = State.Builder.CreateGEP( 9541 IndDesc.getElementType(), NewPointerPhi, 9542 State.Builder.CreateMul( 9543 StartOffset, 9544 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9545 "vector.gep")); 9546 State.set(this, GEP, Part); 9547 } 9548 } 9549 9550 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9551 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9552 9553 // Fast-math-flags propagate from the original induction instruction. 9554 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9555 if (IndDesc.getInductionBinOp() && 9556 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9557 State.Builder.setFastMathFlags( 9558 IndDesc.getInductionBinOp()->getFastMathFlags()); 9559 9560 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9561 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9562 Value *DerivedIV = 9563 emitTransformedIndex(State.Builder, CanonicalIV, 9564 getStartValue()->getLiveInIRValue(), Step, IndDesc); 9565 DerivedIV->setName("offset.idx"); 9566 if (ResultTy != DerivedIV->getType()) { 9567 assert(Step->getType()->isIntegerTy() && 9568 "Truncation requires an integer step"); 9569 DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); 9570 } 9571 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9572 9573 State.set(this, DerivedIV, VPIteration(0, 0)); 9574 } 9575 9576 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9577 // Fast-math-flags propagate from the original induction instruction. 9578 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9579 if (IndDesc.getInductionBinOp() && 9580 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9581 State.Builder.setFastMathFlags( 9582 IndDesc.getInductionBinOp()->getFastMathFlags()); 9583 9584 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 9585 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9586 9587 buildScalarSteps(BaseIV, Step, IndDesc, this, State); 9588 } 9589 9590 void VPInterleaveRecipe::execute(VPTransformState &State) { 9591 assert(!State.Instance && "Interleave group being replicated."); 9592 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9593 getStoredValues(), getMask()); 9594 } 9595 9596 void VPReductionRecipe::execute(VPTransformState &State) { 9597 assert(!State.Instance && "Reduction being replicated."); 9598 Value *PrevInChain = State.get(getChainOp(), 0); 9599 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9600 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9601 // Propagate the fast-math flags carried by the underlying instruction. 
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(UI, this, *State.Instance, IsPredicated,
                                    State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison =
            PoisonValue::get(VectorType::get(UI->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  if (IsUniform) {
    // If the recipe is uniform across all parts (instead of just per VF), only
    // generate a single instance.
    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
        all_of(operands(), [](VPValue *Op) {
          return Op->isDefinedOutsideVectorRegions();
        })) {
      State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
                                      State);
      if (user_begin() != user_end()) {
        for (unsigned Part = 1; Part < State.UF; ++Part)
          State.set(this, State.get(this, VPIteration(0, 0)),
                    VPIteration(Part, 0));
      }
      return;
    }

    // Uniform within VL means we need to generate lane 0 only for each
    // unrolled copy.
    for (unsigned Part = 0; Part < State.UF; ++Part)
      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
                                      IsPredicated, State);
    return;
  }

  // A store of a loop-varying value to a loop-invariant address needs only
  // the last copy of the store.
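  // For example (illustrative, assuming VF = 4 and UF = 2), the scalar loop
  // would have executed this store eight times per vector iteration, but every
  // execution overwrites the same invariant address, so emitting only the
  // instance for the last lane of the last part (part 1, lane 3) preserves the
  // final stored value.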
9692 if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) { 9693 auto Lane = VPLane::getLastLaneForVF(State.VF); 9694 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated, 9695 State); 9696 return; 9697 } 9698 9699 // Generate scalar instances for all VF lanes of all UF parts. 9700 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9701 const unsigned EndLane = State.VF.getKnownMinValue(); 9702 for (unsigned Part = 0; Part < State.UF; ++Part) 9703 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9704 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), 9705 IsPredicated, State); 9706 } 9707 9708 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9709 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9710 9711 // Attempt to issue a wide load. 9712 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9713 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9714 9715 assert((LI || SI) && "Invalid Load/Store instruction"); 9716 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9717 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9718 9719 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9720 9721 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9722 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9723 bool CreateGatherScatter = !Consecutive; 9724 9725 auto &Builder = State.Builder; 9726 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9727 bool isMaskRequired = getMask(); 9728 if (isMaskRequired) 9729 for (unsigned Part = 0; Part < State.UF; ++Part) 9730 BlockInMaskParts[Part] = State.get(getMask(), Part); 9731 9732 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9733 // Calculate the pointer for the specific unroll-part. 9734 GetElementPtrInst *PartPtr = nullptr; 9735 9736 bool InBounds = false; 9737 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9738 InBounds = gep->isInBounds(); 9739 if (Reverse) { 9740 // If the address is consecutive but reversed, then the 9741 // wide store needs to start at the last vector element. 9742 // RunTimeVF = VScale * VF.getKnownMinValue() 9743 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9744 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9745 // NumElt = -Part * RunTimeVF 9746 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9747 // LastLane = 1 - RunTimeVF 9748 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9749 PartPtr = 9750 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9751 PartPtr->setIsInBounds(InBounds); 9752 PartPtr = cast<GetElementPtrInst>( 9753 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9754 PartPtr->setIsInBounds(InBounds); 9755 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9756 BlockInMaskParts[Part] = 9757 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9758 } else { 9759 Value *Increment = 9760 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9761 PartPtr = cast<GetElementPtrInst>( 9762 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9763 PartPtr->setIsInBounds(InBounds); 9764 } 9765 9766 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9767 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9768 }; 9769 9770 // Handle Stores: 9771 if (SI) { 9772 State.setDebugLocFromInst(SI); 9773 9774 for (unsigned Part = 0; Part < State.UF; ++Part) { 9775 Instruction *NewSI = nullptr; 9776 Value *StoredVal = State.get(StoredValue, Part); 9777 if (CreateGatherScatter) { 9778 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9779 Value *VectorGep = State.get(getAddr(), Part); 9780 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9781 MaskPart); 9782 } else { 9783 if (Reverse) { 9784 // If we store to reverse consecutive memory locations, then we need 9785 // to reverse the order of elements in the stored value. 9786 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9787 // We don't want to update the value in the map as it might be used in 9788 // another expression. So don't call resetVectorValue(StoredVal). 9789 } 9790 auto *VecPtr = 9791 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9792 if (isMaskRequired) 9793 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9794 BlockInMaskParts[Part]); 9795 else 9796 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9797 } 9798 State.addMetadata(NewSI, SI); 9799 } 9800 return; 9801 } 9802 9803 // Handle loads. 9804 assert(LI && "Must have a load instruction"); 9805 State.setDebugLocFromInst(LI); 9806 for (unsigned Part = 0; Part < State.UF; ++Part) { 9807 Value *NewLI; 9808 if (CreateGatherScatter) { 9809 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9810 Value *VectorGep = State.get(getAddr(), Part); 9811 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9812 nullptr, "wide.masked.gather"); 9813 State.addMetadata(NewLI, LI); 9814 } else { 9815 auto *VecPtr = 9816 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9817 if (isMaskRequired) 9818 NewLI = Builder.CreateMaskedLoad( 9819 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9820 PoisonValue::get(DataTy), "wide.masked.load"); 9821 else 9822 NewLI = 9823 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9824 9825 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9826 State.addMetadata(NewLI, LI); 9827 if (Reverse) 9828 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9829 } 9830 9831 State.set(getVPSingleValue(), NewLI, Part); 9832 } 9833 } 9834 9835 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9836 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9837 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9838 // for predication. 
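// For instance, a loop in a function compiled for minimum size (or considered
// cold under PGSO) is assigned CM_ScalarEpilogueNotAllowedOptSize before the
// remaining criteria are consulted, while a loop carrying only the predicate
// hint maps to CM_ScalarEpilogueNotNeededUsePredicate. (Example mapping only;
// the exact precedence is spelled out in the numbered comments inside the
// function below.)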
9839 static ScalarEpilogueLowering getScalarEpilogueLowering( 9840 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9841 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9842 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9843 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9844 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9845 // don't look at hints or options, and don't request a scalar epilogue. 9846 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9847 // LoopAccessInfo (due to code dependency and not being able to reliably get 9848 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9849 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9850 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9851 // back to the old way and vectorize with versioning when forced. See D81345.) 9852 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9853 PGSOQueryType::IRPass) && 9854 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9855 return CM_ScalarEpilogueNotAllowedOptSize; 9856 9857 // 2) If set, obey the directives 9858 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9859 switch (PreferPredicateOverEpilogue) { 9860 case PreferPredicateTy::ScalarEpilogue: 9861 return CM_ScalarEpilogueAllowed; 9862 case PreferPredicateTy::PredicateElseScalarEpilogue: 9863 return CM_ScalarEpilogueNotNeededUsePredicate; 9864 case PreferPredicateTy::PredicateOrDontVectorize: 9865 return CM_ScalarEpilogueNotAllowedUsePredicate; 9866 }; 9867 } 9868 9869 // 3) If set, obey the hints 9870 switch (Hints.getPredicate()) { 9871 case LoopVectorizeHints::FK_Enabled: 9872 return CM_ScalarEpilogueNotNeededUsePredicate; 9873 case LoopVectorizeHints::FK_Disabled: 9874 return CM_ScalarEpilogueAllowed; 9875 }; 9876 9877 // 4) if the TTI hook indicates this is profitable, request predication. 9878 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) 9879 return CM_ScalarEpilogueNotNeededUsePredicate; 9880 9881 return CM_ScalarEpilogueAllowed; 9882 } 9883 9884 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9885 // If Values have been set for this Def return the one relevant for \p Part. 9886 if (hasVectorValue(Def, Part)) 9887 return Data.PerPartOutput[Def][Part]; 9888 9889 if (!hasScalarValue(Def, {Part, 0})) { 9890 Value *IRV = Def->getLiveInIRValue(); 9891 Value *B = ILV->getBroadcastInstrs(IRV); 9892 set(Def, B, Part); 9893 return B; 9894 } 9895 9896 Value *ScalarValue = get(Def, {Part, 0}); 9897 // If we aren't vectorizing, we can just copy the scalar map values over 9898 // to the vector map. 9899 if (VF.isScalar()) { 9900 set(Def, ScalarValue, Part); 9901 return ScalarValue; 9902 } 9903 9904 bool IsUniform = vputils::isUniformAfterVectorization(Def); 9905 9906 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9907 // Check if there is a scalar value for the selected lane. 9908 if (!hasScalarValue(Def, {Part, LastLane})) { 9909 // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform. 
9910 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || 9911 isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) && 9912 "unexpected recipe found to be invariant"); 9913 IsUniform = true; 9914 LastLane = 0; 9915 } 9916 9917 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9918 // Set the insert point after the last scalarized instruction or after the 9919 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9920 // will directly follow the scalar definitions. 9921 auto OldIP = Builder.saveIP(); 9922 auto NewIP = 9923 isa<PHINode>(LastInst) 9924 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9925 : std::next(BasicBlock::iterator(LastInst)); 9926 Builder.SetInsertPoint(&*NewIP); 9927 9928 // However, if we are vectorizing, we need to construct the vector values. 9929 // If the value is known to be uniform after vectorization, we can just 9930 // broadcast the scalar value corresponding to lane zero for each unroll 9931 // iteration. Otherwise, we construct the vector values using 9932 // insertelement instructions. Since the resulting vectors are stored in 9933 // State, we will only generate the insertelements once. 9934 Value *VectorValue = nullptr; 9935 if (IsUniform) { 9936 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9937 set(Def, VectorValue, Part); 9938 } else { 9939 // Initialize packing with insertelements to start from undef. 9940 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9941 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9942 set(Def, Undef, Part); 9943 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9944 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9945 VectorValue = get(Def, Part); 9946 } 9947 Builder.restoreIP(OldIP); 9948 return VectorValue; 9949 } 9950 9951 // Process the loop in the VPlan-native vectorization path. This path builds 9952 // VPlan upfront in the vectorization pipeline, which allows to apply 9953 // VPlan-to-VPlan transformations from the very beginning without modifying the 9954 // input LLVM IR. 9955 static bool processLoopInVPlanNativePath( 9956 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9957 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9958 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9959 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9960 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9961 LoopVectorizationRequirements &Requirements) { 9962 9963 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9964 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9965 return false; 9966 } 9967 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9968 Function *F = L->getHeader()->getParent(); 9969 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9970 9971 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9972 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI); 9973 9974 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9975 &Hints, IAI); 9976 // Use the planner for outer loop vectorization. 9977 // TODO: CM is not used at this point inside the planner. Turn CM into an 9978 // optional argument if we don't need it in the future. 9979 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); 9980 9981 // Get user vectorization factor. 
9982 ElementCount UserVF = Hints.getWidth(); 9983 9984 CM.collectElementTypesForWidening(); 9985 9986 // Plan how to best vectorize, return the best VF and its cost. 9987 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9988 9989 // If we are stress testing VPlan builds, do not attempt to generate vector 9990 // code. Masked vector code generation support will follow soon. 9991 // Also, do not attempt to vectorize if no vector code will be produced. 9992 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9993 return false; 9994 9995 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 9996 9997 { 9998 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9999 F->getParent()->getDataLayout()); 10000 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10001 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 10002 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10003 << L->getHeader()->getParent()->getName() << "\"\n"); 10004 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10005 } 10006 10007 // Mark the loop as already vectorized to avoid vectorizing again. 10008 Hints.setAlreadyVectorized(); 10009 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10010 return true; 10011 } 10012 10013 // Emit a remark if there are stores to floats that required a floating point 10014 // extension. If the vectorized loop was generated with floating point there 10015 // will be a performance penalty from the conversion overhead and the change in 10016 // the vector width. 10017 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10018 SmallVector<Instruction *, 4> Worklist; 10019 for (BasicBlock *BB : L->getBlocks()) { 10020 for (Instruction &Inst : *BB) { 10021 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10022 if (S->getValueOperand()->getType()->isFloatTy()) 10023 Worklist.push_back(S); 10024 } 10025 } 10026 } 10027 10028 // Traverse the floating point stores upwards searching, for floating point 10029 // conversions. 10030 SmallPtrSet<const Instruction *, 4> Visited; 10031 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10032 while (!Worklist.empty()) { 10033 auto *I = Worklist.pop_back_val(); 10034 if (!L->contains(I)) 10035 continue; 10036 if (!Visited.insert(I).second) 10037 continue; 10038 10039 // Emit a remark if the floating point store required a floating 10040 // point conversion. 10041 // TODO: More work could be done to identify the root cause such as a 10042 // constant or a function return type and point the user to it. 10043 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10044 ORE->emit([&]() { 10045 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10046 I->getDebugLoc(), L->getHeader()) 10047 << "floating point conversion changes vector width. " 10048 << "Mixed floating point precision requires an up/down " 10049 << "cast that will negatively impact performance."; 10050 }); 10051 10052 for (Use &Op : I->operands()) 10053 if (auto *OpI = dyn_cast<Instruction>(Op)) 10054 Worklist.push_back(OpI); 10055 } 10056 } 10057 10058 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10059 VectorizationFactor &VF, 10060 std::optional<unsigned> VScale, Loop *L, 10061 ScalarEvolution &SE) { 10062 InstructionCost CheckCost = Checks.getCost(); 10063 if (!CheckCost.isValid()) 10064 return false; 10065 10066 // When interleaving only scalar and vector cost will be equal, which in turn 10067 // would lead to a divide by 0. Fall back to hard threshold. 
10068 if (VF.Width.isScalar()) { 10069 if (CheckCost > VectorizeMemoryCheckThreshold) { 10070 LLVM_DEBUG( 10071 dbgs() 10072 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10073 return false; 10074 } 10075 return true; 10076 } 10077 10078 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 10079 double ScalarC = *VF.ScalarCost.getValue(); 10080 if (ScalarC == 0) 10081 return true; 10082 10083 // First, compute the minimum iteration count required so that the vector 10084 // loop outperforms the scalar loop. 10085 // The total cost of the scalar loop is 10086 // ScalarC * TC 10087 // where 10088 // * TC is the actual trip count of the loop. 10089 // * ScalarC is the cost of a single scalar iteration. 10090 // 10091 // The total cost of the vector loop is 10092 // RtC + VecC * (TC / VF) + EpiC 10093 // where 10094 // * RtC is the cost of the generated runtime checks 10095 // * VecC is the cost of a single vector iteration. 10096 // * TC is the actual trip count of the loop 10097 // * VF is the vectorization factor 10098 // * EpiCost is the cost of the generated epilogue, including the cost 10099 // of the remaining scalar operations. 10100 // 10101 // Vectorization is profitable once the total vector cost is less than the 10102 // total scalar cost: 10103 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10104 // 10105 // Now we can compute the minimum required trip count TC as 10106 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 10107 // 10108 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10109 // the computations are performed on doubles, not integers and the result 10110 // is rounded up, hence we get an upper estimate of the TC. 10111 unsigned IntVF = VF.Width.getKnownMinValue(); 10112 if (VF.Width.isScalable()) { 10113 unsigned AssumedMinimumVscale = 1; 10114 if (VScale) 10115 AssumedMinimumVscale = *VScale; 10116 IntVF *= AssumedMinimumVscale; 10117 } 10118 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 10119 double RtC = *CheckCost.getValue(); 10120 double MinTC1 = RtC / (ScalarC - VecCOverVF); 10121 10122 // Second, compute a minimum iteration count so that the cost of the 10123 // runtime checks is only a fraction of the total scalar loop cost. This 10124 // adds a loop-dependent bound on the overhead incurred if the runtime 10125 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10126 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10127 // cost, compute 10128 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10129 double MinTC2 = RtC * 10 / ScalarC; 10130 10131 // Now pick the larger minimum. If it is not a multiple of VF, choose the 10132 // next closest multiple of VF. This should partly compensate for ignoring 10133 // the epilogue cost. 10134 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 10135 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); 10136 10137 LLVM_DEBUG( 10138 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10139 << VF.MinProfitableTripCount << "\n"); 10140 10141 // Skip vectorization if the expected trip count is less than the minimum 10142 // required trip count. 
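  // As a worked example (numbers purely illustrative): with ScalarC = 4,
  // VecC = 10, a fixed VF of 4 (so VecC / VF = 2.5) and RtC = 30, the two
  // bounds computed above are MinTC1 = 30 / (4 - 2.5) = 20 and
  // MinTC2 = 30 * 10 / 4 = 75, so MinProfitableTripCount becomes 75 rounded up
  // to the next multiple of the VF, i.e. 76. A loop whose expected trip count
  // is below that bound is rejected below.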
10143 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 10144 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10145 VF.MinProfitableTripCount)) { 10146 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10147 "trip count < minimum profitable VF (" 10148 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10149 << ")\n"); 10150 10151 return false; 10152 } 10153 } 10154 return true; 10155 } 10156 10157 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10158 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10159 !EnableLoopInterleaving), 10160 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10161 !EnableLoopVectorization) {} 10162 10163 bool LoopVectorizePass::processLoop(Loop *L) { 10164 assert((EnableVPlanNativePath || L->isInnermost()) && 10165 "VPlan-native path is not enabled. Only process inner loops."); 10166 10167 #ifndef NDEBUG 10168 const std::string DebugLocStr = getDebugLocString(L); 10169 #endif /* NDEBUG */ 10170 10171 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10172 << L->getHeader()->getParent()->getName() << "' from " 10173 << DebugLocStr << "\n"); 10174 10175 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10176 10177 LLVM_DEBUG( 10178 dbgs() << "LV: Loop hints:" 10179 << " force=" 10180 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10181 ? "disabled" 10182 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10183 ? "enabled" 10184 : "?")) 10185 << " width=" << Hints.getWidth() 10186 << " interleave=" << Hints.getInterleave() << "\n"); 10187 10188 // Function containing loop 10189 Function *F = L->getHeader()->getParent(); 10190 10191 // Looking at the diagnostic output is the only way to determine if a loop 10192 // was vectorized (other than looking at the IR or machine code), so it 10193 // is important to generate an optimization remark for each loop. Most of 10194 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10195 // generated as OptimizationRemark and OptimizationRemarkMissed are 10196 // less verbose reporting vectorized loops and unvectorized loops that may 10197 // benefit from vectorization, respectively. 10198 10199 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10200 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10201 return false; 10202 } 10203 10204 PredicatedScalarEvolution PSE(*SE, *L); 10205 10206 // Check if it is legal to vectorize the loop. 10207 LoopVectorizationRequirements Requirements; 10208 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10209 &Requirements, &Hints, DB, AC, BFI, PSI); 10210 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10211 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10212 Hints.emitRemarkWithHints(); 10213 return false; 10214 } 10215 10216 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10217 // here. They may require CFG and instruction level transformations before 10218 // even evaluating whether vectorization is profitable. Since we cannot modify 10219 // the incoming IR, we need to build VPlan upfront in the vectorization 10220 // pipeline. 
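  // (Typically this path is only reached for outer loops that were explicitly
  // annotated for vectorization, e.g. via "#pragma clang loop vectorize(enable)"
  // on the outer loop, and only when EnableVPlanNativePath is set. This is an
  // illustrative note; the authoritative checks live in
  // LoopVectorizationLegality::canVectorize.)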
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);

  // Check the loop for a trip count threshold: vectorize loops with a tiny
  // trip count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
        LLVM_DEBUG(dbgs() << "\n");
        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
      } else {
        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                             "small to consider vectorizing.\n");
        reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
            "loop trip count is too low, avoiding vectorization",
            "LowTripCount", ORE, L);
        Hints.emitRemarkWithHints();
        return false;
      }
    }
  }

  // Check the function attributes to see if implicit floats or vectors are
  // allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
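  // (For example, forcing ForceOrderedReductions to true permits strict,
  // in-order FP reductions even when the target's enableOrderedReductions()
  // hook returns false, and forcing it to false disables them unconditionally.
  // The command-line spelling is whatever the ForceOrderedReductions cl::opt
  // was declared with; this note is illustrative only.)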
10296 if (ForceOrderedReductions.getNumOccurrences() > 0) 10297 AllowOrderedReductions = ForceOrderedReductions; 10298 else 10299 AllowOrderedReductions = TTI->enableOrderedReductions(); 10300 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10301 ORE->emit([&]() { 10302 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10303 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10304 ExactFPMathInst->getDebugLoc(), 10305 ExactFPMathInst->getParent()) 10306 << "loop not vectorized: cannot prove it is safe to reorder " 10307 "floating-point operations"; 10308 }); 10309 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10310 "reorder floating-point operations\n"); 10311 Hints.emitRemarkWithHints(); 10312 return false; 10313 } 10314 10315 // Use the cost model. 10316 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10317 F, &Hints, IAI); 10318 CM.collectValuesToIgnore(); 10319 CM.collectElementTypesForWidening(); 10320 10321 // Use the planner for vectorization. 10322 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); 10323 10324 // Get user vectorization factor and interleave count. 10325 ElementCount UserVF = Hints.getWidth(); 10326 unsigned UserIC = Hints.getInterleave(); 10327 10328 // Plan how to best vectorize, return the best VF and its cost. 10329 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10330 10331 VectorizationFactor VF = VectorizationFactor::Disabled(); 10332 unsigned IC = 1; 10333 10334 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10335 F->getParent()->getDataLayout()); 10336 if (MaybeVF) { 10337 VF = *MaybeVF; 10338 // Select the interleave count. 10339 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10340 10341 unsigned SelectedIC = std::max(IC, UserIC); 10342 // Optimistically generate runtime checks if they are needed. Drop them if 10343 // they turn out to not be profitable. 10344 if (VF.Width.isVector() || SelectedIC > 1) 10345 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10346 10347 // Check if it is profitable to vectorize with runtime checks. 10348 bool ForceVectorization = 10349 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10350 if (!ForceVectorization && 10351 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, 10352 *PSE.getSE())) { 10353 ORE->emit([&]() { 10354 return OptimizationRemarkAnalysisAliasing( 10355 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10356 L->getHeader()) 10357 << "loop not vectorized: cannot prove it is safe to reorder " 10358 "memory operations"; 10359 }); 10360 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10361 Hints.emitRemarkWithHints(); 10362 return false; 10363 } 10364 } 10365 10366 // Identify the diagnostic messages that should be produced. 10367 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10368 bool VectorizeLoop = true, InterleaveLoop = true; 10369 if (VF.Width.isScalar()) { 10370 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10371 VecDiagMsg = std::make_pair( 10372 "VectorizationNotBeneficial", 10373 "the cost-model indicates that vectorization is not beneficial"); 10374 VectorizeLoop = false; 10375 } 10376 10377 if (!MaybeVF && UserIC > 1) { 10378 // Tell the user interleaving was avoided up-front, despite being explicitly 10379 // requested. 
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that vectorizing the loop is not profitable, then
      // interleave it.
10456 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10457 &CM, BFI, PSI, Checks); 10458 10459 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10460 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10461 10462 ORE->emit([&]() { 10463 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10464 L->getHeader()) 10465 << "interleaved loop (interleaved count: " 10466 << NV("InterleaveCount", IC) << ")"; 10467 }); 10468 } else { 10469 // If we decided that it is *legal* to vectorize the loop, then do it. 10470 10471 // Consider vectorizing the epilogue too if it's profitable. 10472 VectorizationFactor EpilogueVF = 10473 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10474 if (EpilogueVF.Width.isVector()) { 10475 10476 // The first pass vectorizes the main loop and creates a scalar epilogue 10477 // to be vectorized by executing the plan (potentially with a different 10478 // factor) again shortly afterwards. 10479 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10480 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10481 EPI, &LVL, &CM, BFI, PSI, Checks); 10482 10483 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10484 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10485 DT, true); 10486 ++LoopsVectorized; 10487 10488 // Second pass vectorizes the epilogue and adjusts the control flow 10489 // edges from the first pass. 10490 EPI.MainLoopVF = EPI.EpilogueVF; 10491 EPI.MainLoopUF = EPI.EpilogueUF; 10492 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10493 ORE, EPI, &LVL, &CM, BFI, PSI, 10494 Checks); 10495 10496 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10497 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10498 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10499 Header->setName("vec.epilog.vector.body"); 10500 10501 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10502 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10503 // before vectorizing the epilogue loop. 10504 for (VPRecipeBase &R : Header->phis()) { 10505 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10506 continue; 10507 10508 Value *ResumeV = nullptr; 10509 // TODO: Move setting of resume values to prepareToExecute. 10510 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10511 ResumeV = MainILV.getReductionResumeValue( 10512 ReductionPhi->getRecurrenceDescriptor()); 10513 } else { 10514 // Create induction resume values for both widened pointer and 10515 // integer/fp inductions and update the start value of the induction 10516 // recipes to use the resume value. 
10517 PHINode *IndPhi = nullptr; 10518 const InductionDescriptor *ID; 10519 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10520 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10521 ID = &Ind->getInductionDescriptor(); 10522 } else { 10523 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10524 IndPhi = WidenInd->getPHINode(); 10525 ID = &WidenInd->getInductionDescriptor(); 10526 } 10527 10528 ResumeV = MainILV.createInductionResumeValue( 10529 IndPhi, *ID, {EPI.MainLoopIterationCountCheck}); 10530 } 10531 assert(ResumeV && "Must have a resume value"); 10532 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV); 10533 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10534 } 10535 10536 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10537 DT, true); 10538 ++LoopsEpilogueVectorized; 10539 10540 if (!MainILV.areSafetyChecksAdded()) 10541 DisableRuntimeUnroll = true; 10542 } else { 10543 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10544 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10545 PSI, Checks); 10546 10547 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10548 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10549 ++LoopsVectorized; 10550 10551 // Add metadata to disable runtime unrolling a scalar loop when there 10552 // are no runtime checks about strides and memory. A scalar loop that is 10553 // rarely used is not worth unrolling. 10554 if (!LB.areSafetyChecksAdded()) 10555 DisableRuntimeUnroll = true; 10556 } 10557 // Report the vectorization decision. 10558 ORE->emit([&]() { 10559 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10560 L->getHeader()) 10561 << "vectorized loop (vectorization width: " 10562 << NV("VectorizationFactor", VF.Width) 10563 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10564 }); 10565 } 10566 10567 if (ORE->allowExtraAnalysis(LV_NAME)) 10568 checkMixedPrecision(L, ORE); 10569 } 10570 10571 std::optional<MDNode *> RemainderLoopID = 10572 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10573 LLVMLoopVectorizeFollowupEpilogue}); 10574 if (RemainderLoopID) { 10575 L->setLoopID(*RemainderLoopID); 10576 } else { 10577 if (DisableRuntimeUnroll) 10578 AddRuntimeUnrollDisableMetaData(L); 10579 10580 // Mark the loop as already vectorized to avoid vectorizing again. 10581 Hints.setAlreadyVectorized(); 10582 } 10583 10584 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10585 return true; 10586 } 10587 10588 LoopVectorizeResult LoopVectorizePass::runImpl( 10589 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10590 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10591 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, 10592 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10593 SE = &SE_; 10594 LI = &LI_; 10595 TTI = &TTI_; 10596 DT = &DT_; 10597 BFI = &BFI_; 10598 TLI = TLI_; 10599 AC = &AC_; 10600 LAIs = &LAIs_; 10601 DB = &DB_; 10602 ORE = &ORE_; 10603 PSI = PSI_; 10604 10605 // Don't attempt if 10606 // 1. the target claims to have no vector registers, and 10607 // 2. interleaving won't help ILP. 10608 // 10609 // The second condition is necessary because, even if the target has no 10610 // vector registers, loop vectorization may still enable scalar 10611 // interleaving. 
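  //
  // For example (illustrative), a target with no vector register class but
  // with getMaxInterleaveFactor(1) >= 2 still runs the pass so that scalar
  // interleaving can expose more ILP; only when both conditions hold is the
  // whole function skipped.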
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed)
      LAIs->clear();
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
10695 AM.getResult<ShouldRunExtraVectorPasses>(F); 10696 PA.preserve<ShouldRunExtraVectorPasses>(); 10697 } else { 10698 PA.preserveSet<CFGAnalyses>(); 10699 } 10700 return PA; 10701 } 10702 10703 void LoopVectorizePass::printPipeline( 10704 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10705 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10706 OS, MapClassName2PassName); 10707 10708 OS << "<"; 10709 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10710 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10711 OS << ">"; 10712 } 10713
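// Note (derived from the stream inserts above, illustrative only): when both
// members are false, printPipeline emits the pass parameters as
// "<no-interleave-forced-only;no-vectorize-forced-only;>".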